From 54048c2a7272f254c56b29cfab26eaab84aacbeb Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 12 Mar 2024 16:27:29 +0000 Subject: [PATCH 01/72] Reorder functions --- .../ensembl/io/genomio/gff3/simplifier.py | 254 +++++++++--------- 1 file changed, 128 insertions(+), 126 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index a7afefe14..9229e2a35 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -160,9 +160,9 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: # Create actual genes from transcripts/CDS top level features if gene.type in transcript_types: - gene = self.transcript_gene(gene) + gene = self.create_gene_for_lone_transcript(gene) elif gene.type == "CDS": - gene = self.cds_gene(gene) + gene = self.create_gene_for_lone_cds(gene) # What to do with unsupported gene types if gene.type not in allowed_gene_types: @@ -175,42 +175,54 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: self.annotations.store_gene(gene) return self.clean_gene(gene) - def clean_gene(self, gene: SeqFeature) -> SeqFeature: - """Return the same gene without qualifiers unrelated to the gene structure.""" + # COMPLETION + def create_gene_for_lone_transcript(self, ncrna: SeqFeature) -> SeqFeature: + """Create a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA' for all others - old_gene_qualifiers = gene.qualifiers - try: - gene.qualifiers = {"ID": gene.id, "source": old_gene_qualifiers["source"]} - except KeyError as err: - raise KeyError(f"Missing source for {gene.id}") from err - for transcript in gene.sub_features: - # Replace qualifiers - old_transcript_qualifiers = transcript.qualifiers - transcript.qualifiers = { - "ID": transcript.id, - "Parent": gene.id, - } - if "source" in old_transcript_qualifiers: - transcript.qualifiers["source"] = old_transcript_qualifiers["source"] + Args: + ncrna: the transcript for which we want to create a gene. - for feat in transcript.sub_features: - old_qualifiers = feat.qualifiers - feat.qualifiers = { - "ID": feat.id, - "Parent": transcript.id, - "source": old_qualifiers["source"], - } - if feat.type == "CDS": - try: - feat.qualifiers["phase"] = old_qualifiers["phase"] - except KeyError as err: - raise KeyError( - f"Missing phase for gene {gene.type} {gene.id}, CDS {feat.id} ({old_qualifiers})" - ) from err + Returns: + The gene that contains the transcript. + + """ + new_type = "ncRNA_gene" + if ncrna.type in ("tRNA", "rRNA"): + new_type = "gene" + logging.debug(f"Put the transcript {ncrna.type} in a {new_type} parent feature") + gene = SeqFeature(ncrna.location, type=new_type) + gene.qualifiers["source"] = ncrna.qualifiers["source"] + gene.sub_features = [ncrna] + gene.id = ncrna.id + + return gene + + def create_gene_for_lone_cds(self, cds: SeqFeature) -> SeqFeature: + """Returns a gene created for a lone CDS.""" + + logging.debug(f"Put the lone CDS in gene-mRNA parent features for {cds.id}") + + # Create a transcript, add the CDS + transcript = SeqFeature(cds.location, type="mRNA") + transcript.qualifiers["source"] = cds.qualifiers["source"] + transcript.sub_features = [cds] + + # Add an exon too + exon = SeqFeature(cds.location, type="exon") + exon.qualifiers["source"] = cds.qualifiers["source"] + transcript.sub_features.append(exon) + + # Create a gene, add the transcript + gene_type = "gene" + if ("pseudo" in cds.qualifiers) and (cds.qualifiers["pseudo"][0] == "true"): + gene_type = "pseudogene" + gene = SeqFeature(cds.location, type=gene_type) + gene.qualifiers["source"] = cds.qualifiers["source"] + gene.sub_features = [transcript] + gene.id = self.stable_ids.generate_gene_id() return gene - # FORMATTERS def format_mobile_element(self, feat: SeqFeature) -> SeqFeature: """Given a mobile_genetic_element feature, transform it into a transposable_element""" @@ -252,30 +264,41 @@ def format_mobile_element(self, feat: SeqFeature) -> SeqFeature: return feat - def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature: - """Returns the equivalent Ensembl biotype feature for gene segment transcript features. + # GENES + def clean_gene(self, gene: SeqFeature) -> SeqFeature: + """Return the same gene without qualifiers unrelated to the gene structure.""" - Supported features: "C_gene_segment" and "V_gene_segment". + old_gene_qualifiers = gene.qualifiers + try: + gene.qualifiers = {"ID": gene.id, "source": old_gene_qualifiers["source"]} + except KeyError as err: + raise KeyError(f"Missing source for {gene.id}") from err + for transcript in gene.sub_features: + # Replace qualifiers + old_transcript_qualifiers = transcript.qualifiers + transcript.qualifiers = { + "ID": transcript.id, + "Parent": gene.id, + } + if "source" in old_transcript_qualifiers: + transcript.qualifiers["source"] = old_transcript_qualifiers["source"] - Args: - transcript: Gene segment transcript feature. + for feat in transcript.sub_features: + old_qualifiers = feat.qualifiers + feat.qualifiers = { + "ID": feat.id, + "Parent": transcript.id, + "source": old_qualifiers["source"], + } + if feat.type == "CDS": + try: + feat.qualifiers["phase"] = old_qualifiers["phase"] + except KeyError as err: + raise KeyError( + f"Missing phase for gene {gene.type} {gene.id}, CDS {feat.id} ({old_qualifiers})" + ) from err - """ - # Change V/C_gene_segment into a its corresponding transcript names - if transcript.type in ("C_gene_segment", "V_gene_segment"): - standard_name = transcript.qualifiers["standard_name"][0] - biotype = transcript.type.replace("_segment", "") - if re.search(r"\b(immunoglobulin|ig)\b", standard_name, flags=re.IGNORECASE): - biotype = f"IG_{biotype}" - elif re.search(r"\bt[- _]cell\b", standard_name, flags=re.IGNORECASE): - biotype = f"TR_{biotype}" - else: - logging.warning( - f"Unexpected 'standard_name' content for feature {transcript.id}: {standard_name}" - ) - return transcript - transcript.type = biotype - return transcript + return gene def normalize_gene(self, gene: SeqFeature) -> SeqFeature: """Returns a normalized gene structure, separate from the functional elements. @@ -293,6 +316,7 @@ def normalize_gene(self, gene: SeqFeature) -> SeqFeature: return gene + # PSEUDOGENES def normalize_pseudogene(self, gene: SeqFeature) -> None: """Normalize CDSs if allowed, otherwise remove them.""" if gene.type != "pseudogene": @@ -303,6 +327,31 @@ def normalize_pseudogene(self, gene: SeqFeature) -> None: else: self.remove_cds_from_pseudogene(gene) + def remove_cds_from_pseudogene(self, gene: SeqFeature) -> None: + """Removes the CDS from a pseudogene. + + This assumes the CDSs are sub features of the transcript or the gene. + + """ + if gene.type != "pseudogene": + return + + gene_subfeats = [] + for transcript in gene.sub_features: + if transcript.type == "CDS": + logging.debug(f"Remove pseudo CDS {transcript.id}") + continue + new_subfeats = [] + for feat in transcript.sub_features: + if feat.type == "CDS": + logging.debug(f"Remove pseudo CDS {feat.id}") + continue + new_subfeats.append(feat) + transcript.sub_features = new_subfeats + gene_subfeats.append(transcript) + gene.sub_features = gene_subfeats + + # TRANSCRIPTS def normalize_transcripts(self, gene: SeqFeature) -> None: """Normalizes a transcript.""" @@ -335,6 +384,31 @@ def normalize_transcripts(self, gene: SeqFeature) -> None: for elt in sorted(transcripts_to_delete, reverse=True): gene.sub_features.pop(elt) + def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature: + """Returns the equivalent Ensembl biotype feature for gene segment transcript features. + + Supported features: "C_gene_segment" and "V_gene_segment". + + Args: + transcript: Gene segment transcript feature. + + """ + # Change V/C_gene_segment into a its corresponding transcript names + if transcript.type in ("C_gene_segment", "V_gene_segment"): + standard_name = transcript.qualifiers["standard_name"][0] + biotype = transcript.type.replace("_segment", "") + if re.search(r"\b(immunoglobulin|ig)\b", standard_name, flags=re.IGNORECASE): + biotype = f"IG_{biotype}" + elif re.search(r"\bt[- _]cell\b", standard_name, flags=re.IGNORECASE): + biotype = f"TR_{biotype}" + else: + logging.warning( + f"Unexpected 'standard_name' content for feature {transcript.id}: {standard_name}" + ) + return transcript + transcript.type = biotype + return transcript + def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFeature) -> SeqFeature: """Returns a transcript with normalized sub-features.""" ignored_transcript_types = self._biotypes["transcript"]["ignored"] @@ -372,75 +446,3 @@ def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFea for elt in sorted(exons_to_delete, reverse=True): transcript.sub_features.pop(elt) return transcript - - # COMPLETION - def transcript_gene(self, ncrna: SeqFeature) -> SeqFeature: - """Create a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA' for all others - - Args: - ncrna: the transcript for which we want to create a gene. - - Returns: - The gene that contains the transcript. - - """ - new_type = "ncRNA_gene" - if ncrna.type in ("tRNA", "rRNA"): - new_type = "gene" - logging.debug(f"Put the transcript {ncrna.type} in a {new_type} parent feature") - gene = SeqFeature(ncrna.location, type=new_type) - gene.qualifiers["source"] = ncrna.qualifiers["source"] - gene.sub_features = [ncrna] - gene.id = ncrna.id - - return gene - - def cds_gene(self, cds: SeqFeature) -> SeqFeature: - """Returns a gene created for a lone CDS.""" - - logging.debug(f"Put the lone CDS in gene-mRNA parent features for {cds.id}") - - # Create a transcript, add the CDS - transcript = SeqFeature(cds.location, type="mRNA") - transcript.qualifiers["source"] = cds.qualifiers["source"] - transcript.sub_features = [cds] - - # Add an exon too - exon = SeqFeature(cds.location, type="exon") - exon.qualifiers["source"] = cds.qualifiers["source"] - transcript.sub_features.append(exon) - - # Create a gene, add the transcript - gene_type = "gene" - if ("pseudo" in cds.qualifiers) and (cds.qualifiers["pseudo"][0] == "true"): - gene_type = "pseudogene" - gene = SeqFeature(cds.location, type=gene_type) - gene.qualifiers["source"] = cds.qualifiers["source"] - gene.sub_features = [transcript] - gene.id = self.stable_ids.generate_gene_id() - - return gene - - def remove_cds_from_pseudogene(self, gene: SeqFeature) -> None: - """Removes the CDS from a pseudogene. - - This assumes the CDSs are sub features of the transcript or the gene. - - """ - if gene.type != "pseudogene": - return - - gene_subfeats = [] - for transcript in gene.sub_features: - if transcript.type == "CDS": - logging.debug(f"Remove pseudo CDS {transcript.id}") - continue - new_subfeats = [] - for feat in transcript.sub_features: - if feat.type == "CDS": - logging.debug(f"Remove pseudo CDS {feat.id}") - continue - new_subfeats.append(feat) - transcript.sub_features = new_subfeats - gene_subfeats.append(transcript) - gene.sub_features = gene_subfeats From a8b44dab69037cce2de2dc3f4c0436d7d03d61bb Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 12 Mar 2024 16:28:08 +0000 Subject: [PATCH 02/72] Load exception --- src/python/ensembl/io/genomio/gff3/simplifier.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 9229e2a35..dbabffcbf 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -18,7 +18,6 @@ __all__ = [ "Records", - "GFFParserError", "GFFSimplifier", ] @@ -39,6 +38,7 @@ from .extract_annotation import FunctionalAnnotations from .id_allocator import StableIDAllocator from .restructure import restructure_gene +from .exceptions import GFFParserError class Records(list): @@ -54,10 +54,6 @@ def to_gff(self, out_gff_path: PathLike) -> None: GFF.write(self, out_gff_fh) -class GFFParserError(Exception): - """Error when parsing a GFF3 file.""" - - class GFFSimplifier: """Parse a GGF3 file and output a cleaned up GFF3 + annotation json file. From c912b4a59315d8dc0033d3001f1478d7766c2cdb Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 12 Mar 2024 16:47:20 +0000 Subject: [PATCH 03/72] Fail type as a set --- src/python/ensembl/io/genomio/gff3/simplifier.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index dbabffcbf..eb3b58408 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -26,7 +26,7 @@ from os import PathLike from pathlib import Path import re -from typing import Dict, List, Optional +from typing import List, Optional, Set from BCBio import GFF from Bio.SeqRecord import SeqRecord @@ -94,7 +94,7 @@ def __init__( self.stable_ids = StableIDAllocator() self.stable_ids.set_prefix(self.genome) self.exclude_seq_regions: List = [] - self.fail_types: Dict[str, int] = {} + self.fail_types: Set = {} # Init the actual data we will store self.records = Records() @@ -162,7 +162,7 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: # What to do with unsupported gene types if gene.type not in allowed_gene_types: - self.fail_types["gene=" + gene.type] = 1 + self.fail_types.add(f"gene={gene.type}") logging.debug(f"Unsupported gene type: {gene.type} (for {gene.id})") return None @@ -360,7 +360,7 @@ def normalize_transcripts(self, gene: SeqFeature) -> None: transcript.type not in allowed_transcript_types and transcript.type not in ignored_transcript_types ): - self.fail_types["transcript=" + transcript.type] = 1 + self.fail_types.add(f"transcript={transcript.type}") logging.warning( f"Unrecognized transcript type: {transcript.type}" f" for {transcript.id} ({gene.id})" ) @@ -430,7 +430,7 @@ def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFea exons_to_delete.append(tcount) continue - self.fail_types[f"sub_transcript={feat.type}"] = 1 + self.fail_types.add(f"sub_transcript={feat.type}") logging.warning( f"Unrecognized exon type for {feat.type}: {feat.id}" f" (for transcript {transcript.id} of type {transcript.type})" From 61fa9450413927a3dbbc97330284c0ac441454aa Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 10:45:37 +0000 Subject: [PATCH 04/72] Move gff loading in record, add test --- .../ensembl/io/genomio/gff3/simplifier.py | 57 +++++++++-------- src/python/tests/gff3/test_records.py | 63 +++++++++++++++++++ .../tests/gff3/test_records/record_n1.gff | 3 + .../tests/gff3/test_records/record_n2.gff | 5 ++ 4 files changed, 101 insertions(+), 27 deletions(-) create mode 100644 src/python/tests/gff3/test_records.py create mode 100644 src/python/tests/gff3/test_records/record_n1.gff create mode 100644 src/python/tests/gff3/test_records/record_n2.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index eb3b58408..9cd4c0e0d 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -44,12 +44,21 @@ class Records(list): """List of GFF3 SeqRecords.""" - def to_gff(self, out_gff_path: PathLike) -> None: - """Print out the current list of records in a GFF3 file. + def from_gff(self, in_gff_path: PathLike, excluded: Optional[List[str]] = None) -> None: + """Load records from a GFF3 file.""" + if excluded is None: + excluded = [] + with Path(in_gff_path).open("r") as in_gff_fh: + for record in GFF.parse(in_gff_fh): + if record.id in excluded: + logging.debug(f"Skip seq_region {record.id}") + continue + clean_record = SeqRecord(record.seq, id=record.id) + clean_record.features = record.features + self.append(clean_record) - Args: - out_gff_path: Path to GFF3 file where to write the records. - """ + def to_gff(self, out_gff_path: PathLike) -> None: + """Print out the current list of records in a GFF3 file.""" with Path(out_gff_path).open("w") as out_gff_fh: GFF.write(self, out_gff_fh) @@ -93,7 +102,7 @@ def __init__( # Other preparations self.stable_ids = StableIDAllocator() self.stable_ids.set_prefix(self.genome) - self.exclude_seq_regions: List = [] + self.exclude_seq_regions: List[str] = [] self.fail_types: Set = {} # Init the actual data we will store @@ -104,26 +113,20 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: """Loads a GFF3 from INSDC and rewrites it in a simpler version, whilst also writing a functional annotation file. """ - - with Path(in_gff_path).open("r") as in_gff_fh: - for record in GFF.parse(in_gff_fh): - if record.id in self.exclude_seq_regions: - logging.debug(f"Skip seq_region {record.id}") - continue - - # Clean all root features and make clean record - clean_record = SeqRecord(record.seq, id=record.id) - for feature in record.features: - clean_feature = self.simpler_gff3_feature(feature) - if clean_feature is not None: - clean_record.features.append(clean_feature) - self.records.append(clean_record) - - if self.fail_types: - fail_errors = "\n ".join(self.fail_types.keys()) - logging.warning(f"Unrecognized types found:\n {fail_errors}") - if not self.skip_unrecognized: - raise GFFParserError("Unrecognized types found, abort") + self.records.from_gff(in_gff_path, self.exclude_seq_regions) + for record in self.records: + cleaned_features = [] + for feature in record.features: + clean_feature = self.simpler_gff3_feature(feature) + if clean_feature is not None: + cleaned_features.append(clean_feature) + record.features = cleaned_features + + if self.fail_types: + fail_errors = "\n ".join(self.fail_types.keys()) + logging.warning(f"Unrecognized types found:\n {fail_errors}") + if not self.skip_unrecognized: + raise GFFParserError("Unrecognized types found, abort") def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: """Creates a simpler version of a GFF3 feature. @@ -173,7 +176,7 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: # COMPLETION def create_gene_for_lone_transcript(self, ncrna: SeqFeature) -> SeqFeature: - """Create a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA' for all others + """Create a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA_gene' for all others Args: ncrna: the transcript for which we want to create a gene. diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py new file mode 100644 index 000000000..a86e90910 --- /dev/null +++ b/src/python/tests/gff3/test_records.py @@ -0,0 +1,63 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.gff3.simplifier` Record object.""" + +from contextlib import nullcontext as does_not_raise +from os import PathLike +from pathlib import Path +from typing import Any, Callable, ContextManager, Dict, List, Optional, Union + +from Bio.SeqFeature import SeqFeature, SimpleLocation +import pytest +from pytest import param, raises + +from ensembl.io.genomio.gff3.simplifier import Records + + +@pytest.mark.parametrize( + "in_gff, excluded, expected_loaded", + [ + param("record_n1.gff", None, ["scaffold1"], id="1 record"), + param("record_n2.gff", None, ["scaffold1", "scaffold2"], id="2 records"), + param("record_n2.gff", ["scaffold1"], ["scaffold2"], id="2 records, exclude 1"), + param("record_n1.gff", ["Lorem"], ["scaffold1"], id="1 record, exclude not in record"), + ], +) +def test_from_gff(tmp_dir: Path, data_dir: Path, in_gff: PathLike, excluded: Optional[List[str]], expected_loaded: List[str]) -> None: + input = data_dir / in_gff + + records = Records() + if excluded is None: + records.from_gff(input) + else: + records.from_gff(input, excluded) + record_names = [record.id for record in records] + assert record_names == expected_loaded + + +@pytest.mark.parametrize( + "in_gff", + [ + param("record_n1.gff", id="1 record"), + param("record_n2.gff", id="2 records"), + ], +) +def test_to_gff(tmp_dir: Path, data_dir: Path, assert_files: Callable, in_gff: PathLike) -> None: + input = data_dir / in_gff + output = tmp_dir / in_gff + records = Records() + records.from_gff(input) + records.to_gff(output) + assert_files(input, output) diff --git a/src/python/tests/gff3/test_records/record_n1.gff b/src/python/tests/gff3/test_records/record_n1.gff new file mode 100644 index 000000000..8bd717f15 --- /dev/null +++ b/src/python/tests/gff3/test_records/record_n1.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_records/record_n2.gff b/src/python/tests/gff3/test_records/record_n2.gff new file mode 100644 index 000000000..82996e1d8 --- /dev/null +++ b/src/python/tests/gff3/test_records/record_n2.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +##sequence-region scaffold2 1 1000 +scaffold2 Source gene 1 1000 . - . ID=LOREMIPSUM2 From 9dc0eb7d88644ae636dc6ee58cdb20f84596a922 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 10:56:01 +0000 Subject: [PATCH 05/72] test loading invalid gff3 --- src/python/tests/gff3/test_records.py | 9 ++++++++- src/python/tests/gff3/test_records/invalid.gff3 | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 src/python/tests/gff3/test_records/invalid.gff3 diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py index a86e90910..90b8e8cba 100644 --- a/src/python/tests/gff3/test_records.py +++ b/src/python/tests/gff3/test_records.py @@ -35,7 +35,7 @@ param("record_n1.gff", ["Lorem"], ["scaffold1"], id="1 record, exclude not in record"), ], ) -def test_from_gff(tmp_dir: Path, data_dir: Path, in_gff: PathLike, excluded: Optional[List[str]], expected_loaded: List[str]) -> None: +def test_from_gff(data_dir: Path, in_gff: PathLike, excluded: Optional[List[str]], expected_loaded: List[str]) -> None: input = data_dir / in_gff records = Records() @@ -47,6 +47,13 @@ def test_from_gff(tmp_dir: Path, data_dir: Path, in_gff: PathLike, excluded: Opt assert record_names == expected_loaded +def test_from_gff_invalid(data_dir: Path) -> None: + input = data_dir / "invalid.gff3" + records = Records() + with raises(AssertionError): + records.from_gff(input) + + @pytest.mark.parametrize( "in_gff", [ diff --git a/src/python/tests/gff3/test_records/invalid.gff3 b/src/python/tests/gff3/test_records/invalid.gff3 new file mode 100644 index 000000000..f1f10f6d2 --- /dev/null +++ b/src/python/tests/gff3/test_records/invalid.gff3 @@ -0,0 +1,2 @@ +>Lorem ipsum +HAHAHA From 4ce2c1c827d8c6f92f251029f8e8a844a6139e04 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 11:00:32 +0000 Subject: [PATCH 06/72] Fixes --- .../ensembl/io/genomio/gff3/simplifier.py | 2 +- src/python/tests/gff3/test_records.py | 55 ++++++++++--------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 9cd4c0e0d..da989fc7c 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -103,7 +103,7 @@ def __init__( self.stable_ids = StableIDAllocator() self.stable_ids.set_prefix(self.genome) self.exclude_seq_regions: List[str] = [] - self.fail_types: Set = {} + self.fail_types: Set = set() # Init the actual data we will store self.records = Records() diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py index 90b8e8cba..6de6d0683 100644 --- a/src/python/tests/gff3/test_records.py +++ b/src/python/tests/gff3/test_records.py @@ -14,12 +14,10 @@ # limitations under the License. """Unit testing of `ensembl.io.genomio.gff3.simplifier` Record object.""" -from contextlib import nullcontext as does_not_raise from os import PathLike from pathlib import Path -from typing import Any, Callable, ContextManager, Dict, List, Optional, Union +from typing import Callable, List, Optional -from Bio.SeqFeature import SeqFeature, SimpleLocation import pytest from pytest import param, raises @@ -27,44 +25,49 @@ @pytest.mark.parametrize( - "in_gff, excluded, expected_loaded", - [ - param("record_n1.gff", None, ["scaffold1"], id="1 record"), - param("record_n2.gff", None, ["scaffold1", "scaffold2"], id="2 records"), - param("record_n2.gff", ["scaffold1"], ["scaffold2"], id="2 records, exclude 1"), - param("record_n1.gff", ["Lorem"], ["scaffold1"], id="1 record, exclude not in record"), - ], + "in_gff, excluded, expected_loaded", + [ + param("record_n1.gff", None, ["scaffold1"], id="1 record"), + param("record_n2.gff", None, ["scaffold1", "scaffold2"], id="2 records"), + param("record_n2.gff", ["scaffold1"], ["scaffold2"], id="2 records, exclude 1"), + param("record_n1.gff", ["Lorem"], ["scaffold1"], id="1 record, exclude not in record"), + ], ) -def test_from_gff(data_dir: Path, in_gff: PathLike, excluded: Optional[List[str]], expected_loaded: List[str]) -> None: - input = data_dir / in_gff +def test_from_gff( + data_dir: Path, in_gff: PathLike, excluded: Optional[List[str]], expected_loaded: List[str] +) -> None: + """Test loading GFF records from file.""" + input_gff = data_dir / in_gff records = Records() if excluded is None: - records.from_gff(input) + records.from_gff(input_gff) else: - records.from_gff(input, excluded) + records.from_gff(input_gff, excluded) record_names = [record.id for record in records] assert record_names == expected_loaded def test_from_gff_invalid(data_dir: Path) -> None: - input = data_dir / "invalid.gff3" + """Test loading invalid GFF file.""" + input_gff = data_dir / "invalid.gff3" records = Records() with raises(AssertionError): - records.from_gff(input) + records.from_gff(input_gff) @pytest.mark.parametrize( - "in_gff", - [ - param("record_n1.gff", id="1 record"), - param("record_n2.gff", id="2 records"), - ], + "in_gff", + [ + param("record_n1.gff", id="1 record"), + param("record_n2.gff", id="2 records"), + ], ) def test_to_gff(tmp_dir: Path, data_dir: Path, assert_files: Callable, in_gff: PathLike) -> None: - input = data_dir / in_gff - output = tmp_dir / in_gff + """Test writing GFF records to file.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / in_gff records = Records() - records.from_gff(input) - records.to_gff(output) - assert_files(input, output) + records.from_gff(input_gff) + records.to_gff(output_gff) + assert_files(input_gff, output_gff) From 1183164dd95c0ebe4a6a08ef7a3481e49b445290 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 11:13:52 +0000 Subject: [PATCH 07/72] First test for simplifier --- src/python/tests/gff3/test_simplifier.py | 51 +++++++++++++++++++ .../tests/gff3/test_simplifier/ok_genes.gff | 6 +++ .../gff3/test_simplifier/ok_genes_simped.gff | 6 +++ 3 files changed, 63 insertions(+) create mode 100644 src/python/tests/gff3/test_simplifier.py create mode 100644 src/python/tests/gff3/test_simplifier/ok_genes.gff create mode 100644 src/python/tests/gff3/test_simplifier/ok_genes_simped.gff diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py new file mode 100644 index 000000000..98e5ffa37 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier.py @@ -0,0 +1,51 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.gff3.restructure` module.""" + +from contextlib import nullcontext as does_not_raise +from os import PathLike +from pathlib import Path +from typing import Any, Callable, ContextManager, Dict, List, Union + +# from Bio.SeqFeature import SeqFeature, SimpleLocation +import pytest +from pytest import param, raises + +# from ensembl.io.genomio.gff3.exceptions import GFFParserError +from ensembl.io.genomio.gff3.simplifier import GFFSimplifier + + +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param("ok_genes.gff", "ok_genes_simped.gff", does_not_raise(), id="ok gene"), + ], +) +def test_simpler_gff3( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + expectation: ContextManager, +) -> None: + """Test simplifying genes from GFF3 files.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / in_gff + with expectation: + simp = GFFSimplifier() + simp.simpler_gff3(input_gff) + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) diff --git a/src/python/tests/gff3/test_simplifier/ok_genes.gff b/src/python/tests/gff3/test_simplifier/ok_genes.gff new file mode 100644 index 000000000..571f16d05 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/ok_genes.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1_exon1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/ok_genes_simped.gff b/src/python/tests/gff3/test_simplifier/ok_genes_simped.gff new file mode 100644 index 000000000..943179766 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/ok_genes_simped.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 From a05e7571eeb5659574f4b31ebd97abf8ce5afa20 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 11:33:03 +0000 Subject: [PATCH 08/72] Add tests for renaming checks --- src/python/tests/gff3/test_simplifier.py | 37 ++++++++++++++++++- .../gff3/test_simplifier/bad_gene_type.gff | 3 ++ .../gff3/test_simplifier/bad_subtr_type.gff | 5 +++ .../gff3/test_simplifier/bad_tr_type.gff | 4 ++ .../gff3/test_simplifier/genes_badnames.gff | 6 +++ .../genes_badnames_brc4name.gff | 6 +++ .../test_simplifier/genes_badnames_noname.gff | 6 +++ .../gff3/test_simplifier/genome_brc4.json | 20 ++++++++++ .../gff3/test_simplifier/genome_no_brc4.json | 16 ++++++++ 9 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 src/python/tests/gff3/test_simplifier/bad_gene_type.gff create mode 100644 src/python/tests/gff3/test_simplifier/bad_subtr_type.gff create mode 100644 src/python/tests/gff3/test_simplifier/bad_tr_type.gff create mode 100644 src/python/tests/gff3/test_simplifier/genes_badnames.gff create mode 100644 src/python/tests/gff3/test_simplifier/genes_badnames_brc4name.gff create mode 100644 src/python/tests/gff3/test_simplifier/genes_badnames_noname.gff create mode 100644 src/python/tests/gff3/test_simplifier/genome_brc4.json create mode 100644 src/python/tests/gff3/test_simplifier/genome_no_brc4.json diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 98e5ffa37..21c4996e9 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -17,13 +17,13 @@ from contextlib import nullcontext as does_not_raise from os import PathLike from pathlib import Path -from typing import Any, Callable, ContextManager, Dict, List, Union +from typing import Any, Callable, ContextManager, Dict, List, Optional, Union # from Bio.SeqFeature import SeqFeature, SimpleLocation import pytest from pytest import param, raises -# from ensembl.io.genomio.gff3.exceptions import GFFParserError +from ensembl.io.genomio.gff3.exceptions import GFFParserError from ensembl.io.genomio.gff3.simplifier import GFFSimplifier @@ -31,6 +31,9 @@ "in_gff, expected_gff, expectation", [ param("ok_genes.gff", "ok_genes_simped.gff", does_not_raise(), id="ok gene"), + param("bad_gene_type.gff", "", raises(GFFParserError), id="Unsupported gene type"), + param("bad_tr_type.gff", "", raises(GFFParserError), id="Unsupported transcript type"), + param("bad_subtr_type.gff", "", raises(GFFParserError), id="Unsupported subtranscript type"), ], ) def test_simpler_gff3( @@ -49,3 +52,33 @@ def test_simpler_gff3( simp.simpler_gff3(input_gff) simp.records.to_gff(output_gff) assert_files(output_gff, data_dir / expected_gff) + + +@pytest.mark.parametrize( + "genome_file, in_gff, expected_gff, expectation", + [ + param(None, "genes_badnames.gff", "genes_badnames_noname.gff", does_not_raise(), id="Genes with bad names, no genome"), + param("genome_no_brc4.json", "genes_badnames.gff", "genes_badnames_noname.gff", does_not_raise(), id="Genes with bad names, genome not BRC4"), + param("genome_brc4.json", "genes_badnames.gff", "genes_badnames_brc4name.gff", does_not_raise(), id="Genes with bad names, genome BRC4"), + ], +) +def test_gffsimplifier_with_genome( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + genome_file: Optional[PathLike], + in_gff: PathLike, + expected_gff: PathLike, + expectation: ContextManager, +) -> None: + """Test simplifying genes from GFF3 files.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / in_gff + with expectation: + if genome_file is None: + simp = GFFSimplifier() + else: + simp = GFFSimplifier(genome_path=data_dir / genome_file) + simp.simpler_gff3(input_gff) + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) diff --git a/src/python/tests/gff3/test_simplifier/bad_gene_type.gff b/src/python/tests/gff3/test_simplifier/bad_gene_type.gff new file mode 100644 index 000000000..652047cef --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/bad_gene_type.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source lorem_ipsum 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/bad_subtr_type.gff b/src/python/tests/gff3/test_simplifier/bad_subtr_type.gff new file mode 100644 index 000000000..1c478ca1a --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/bad_subtr_type.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source str_lorem_ipsum 1 1000 . - . ID=LOREMIPSUM1_t1_exon1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/bad_tr_type.gff b/src/python/tests/gff3/test_simplifier/bad_tr_type.gff new file mode 100644 index 000000000..c5f39772c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/bad_tr_type.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source tr_lorem_ipsum 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/genes_badnames.gff b/src/python/tests/gff3/test_simplifier/genes_badnames.gff new file mode 100644 index 000000000..6c717d3a9 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genes_badnames.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=L1 +scaffold1 Source mRNA 1 1000 . - . ID=L1t;Parent=L1 +scaffold1 Source exon 1 1000 . - . ID=L1t1;Parent=L1t +scaffold1 Source CDS 100 900 . - 0 ID=L1tc;Parent=L1t diff --git a/src/python/tests/gff3/test_simplifier/genes_badnames_brc4name.gff b/src/python/tests/gff3/test_simplifier/genes_badnames_brc4name.gff new file mode 100644 index 000000000..1f4a37b05 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genes_badnames_brc4name.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=TMP_pfal3D7_1 +scaffold1 Source mRNA 1 1000 . - . ID=TMP_pfal3D7_1_t1;Parent=TMP_pfal3D7_1 +scaffold1 Source exon 1 1000 . - . ID=TMP_pfal3D7_1_t1-E1;Parent=TMP_pfal3D7_1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=TMP_pfal3D7_1_t1_cds;Parent=TMP_pfal3D7_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/genes_badnames_noname.gff b/src/python/tests/gff3/test_simplifier/genes_badnames_noname.gff new file mode 100644 index 000000000..655fb9b15 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genes_badnames_noname.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=TMP_PREFIX_1 +scaffold1 Source mRNA 1 1000 . - . ID=TMP_PREFIX_1_t1;Parent=TMP_PREFIX_1 +scaffold1 Source exon 1 1000 . - . ID=TMP_PREFIX_1_t1-E1;Parent=TMP_PREFIX_1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=TMP_PREFIX_1_t1_cds;Parent=TMP_PREFIX_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/genome_brc4.json b/src/python/tests/gff3/test_simplifier/genome_brc4.json new file mode 100644 index 000000000..77d2f895d --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genome_brc4.json @@ -0,0 +1,20 @@ +{ + "BRC4": { + "component": "PlasmoDB", + "organism_abbrev": "pfal3D7" + }, + "assembly": { + "accession": "GCA_000002765.1", + "provider_name": "RefSeq", + "provider_url": "https://www.ncbi.nlm.nih.gov/refseq", + "version": 1 + }, + "genebuild": { + "start_date": "2023-10-17", + "version": "2023-10-17" + }, + "species": { + "scientific_name": "Plasmodium falciparum", + "taxonomy_id": 36329 + } +} \ No newline at end of file diff --git a/src/python/tests/gff3/test_simplifier/genome_no_brc4.json b/src/python/tests/gff3/test_simplifier/genome_no_brc4.json new file mode 100644 index 000000000..1cf896c30 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genome_no_brc4.json @@ -0,0 +1,16 @@ +{ + "assembly": { + "accession": "GCA_000002765.1", + "provider_name": "RefSeq", + "provider_url": "https://www.ncbi.nlm.nih.gov/refseq", + "version": 1 + }, + "genebuild": { + "start_date": "2023-10-17", + "version": "2023-10-17" + }, + "species": { + "scientific_name": "Plasmodium falciparum", + "taxonomy_id": 36329 + } +} \ No newline at end of file From d1685046fc7ea4b791ec5d3202283ccebe51126c Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 11:40:11 +0000 Subject: [PATCH 09/72] Add test for skip_unrecognized option --- src/python/tests/gff3/test_simplifier.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 21c4996e9..7614e4b4b 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -54,6 +54,35 @@ def test_simpler_gff3( assert_files(output_gff, data_dir / expected_gff) +@pytest.mark.parametrize( + "in_gff, expected_gff, skip_unrecognized, expectation", + [ + param("bad_gene_type.gff", "", None, raises(GFFParserError), id="Unset skip unrecognized, fail"), + param("bad_gene_type.gff", "", True, raises(GFFParserError), id="True skip unrecognized, fail"), + param("bad_gene_type.gff", "bad_gene_type.gff", False, does_not_raise(), id="bad type, Keep"), + param("ok_genes.gff", "ok_genes_simped.gff", False, does_not_raise(), id="ok type, Keep"), + ], +) +def test_simpler_gff3_skip( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + skip_unrecognized: Optional[bool], + expectation: ContextManager, +) -> None: + """Test simplifying genes from GFF3 files.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / in_gff + with expectation: + simp = GFFSimplifier() + if skip_unrecognized is not None: + simp.skip_unrecognized = skip_unrecognized + simp.simpler_gff3(input_gff) + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) + @pytest.mark.parametrize( "genome_file, in_gff, expected_gff, expectation", [ From b20f3eb3671d35ec6fd7f1db2cb578ddc194ca76 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 15:38:57 +0000 Subject: [PATCH 10/72] Die if non-gene non supported --- src/python/ensembl/io/genomio/gff3/simplifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index da989fc7c..e1f39daad 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -149,7 +149,8 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: if feat.type in ("mobile_genetic_element", "transposable_element"): feat = self.format_mobile_element(feat) return feat - return None + else: + GFFParserError(f"Allowed non-genes but not supported: {feat.type} for {feat.id}") # From here we expect only genes gene = feat From 4c6f09fa89a0cc4b9991b0305aaa9b31a0fbc477 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 15:39:18 +0000 Subject: [PATCH 11/72] Fix: mobile element clean to get source --- src/python/ensembl/io/genomio/gff3/simplifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index e1f39daad..02389a900 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -260,7 +260,7 @@ def format_mobile_element(self, feat: SeqFeature) -> SeqFeature: # Generate ID if needed and add it to the functional annotation feat.id = self.stable_ids.normalize_gene_id(feat) self.annotations.add_feature(feat, "transposable_element") - feat.qualifiers = {"ID": feat.id} + feat = self.clean_gene(feat) return feat From d446e614aaa35b64a631ccef08abb773403c8a96 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 15:39:36 +0000 Subject: [PATCH 12/72] Add gff3_feature tests --- src/python/tests/gff3/test_simplifier.py | 34 +++++++++++++++++++ .../gff3/test_simplifier/gene_ignored.gff | 3 ++ .../tests/gff3/test_simplifier/mobile_te.gff | 3 ++ 3 files changed, 40 insertions(+) create mode 100644 src/python/tests/gff3/test_simplifier/gene_ignored.gff create mode 100644 src/python/tests/gff3/test_simplifier/mobile_te.gff diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 7614e4b4b..a3f6f9f68 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -27,6 +27,40 @@ from ensembl.io.genomio.gff3.simplifier import GFFSimplifier +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param("ok_genes.gff", "ok_genes_simped.gff", does_not_raise(), id="ok gene"), + param("gene_ignored.gff", None, does_not_raise(), id="gene ignored"), + param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), + ] +) +def test_simpler_gff3_feature( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: Optional[PathLike], + expectation: ContextManager, +) -> None: + """Test simplifying one gene (from a GFF3 file).""" + input_gff = data_dir / in_gff + with expectation: + simp = GFFSimplifier() + simp.records.from_gff(input_gff) + # Get the only feature + feat = simp.records[0].features[0] + feat = simp.simpler_gff3_feature(feat) + if expected_gff is None: + assert feat == None + else: + output_gff = tmp_dir / in_gff + # Put it back + simp.records[0].features = [feat] + simp.records.to_gff(output_gff) + assert_files(output_gff, Path(data_dir / expected_gff)) + + @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ diff --git a/src/python/tests/gff3/test_simplifier/gene_ignored.gff b/src/python/tests/gff3/test_simplifier/gene_ignored.gff new file mode 100644 index 000000000..f1d3e2cff --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/gene_ignored.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source region 1 1000 . - . ID=1 diff --git a/src/python/tests/gff3/test_simplifier/mobile_te.gff b/src/python/tests/gff3/test_simplifier/mobile_te.gff new file mode 100644 index 000000000..6b9937360 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mobile_te.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transposable_element 1 1000 . - . ID=LOREM_IPSUM1 From c20e69a32e016448745463effd0ef0f9e8edb5f0 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 15:52:07 +0000 Subject: [PATCH 13/72] Ignore notimplemented failsafe --- src/python/ensembl/io/genomio/gff3/simplifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 02389a900..88105b4c6 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -149,8 +149,8 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: if feat.type in ("mobile_genetic_element", "transposable_element"): feat = self.format_mobile_element(feat) return feat - else: - GFFParserError(f"Allowed non-genes but not supported: {feat.type} for {feat.id}") + # This check is a failsafe in case you add supported non-genes + raise NotImplementedError(f"Unsupported non-gene: {feat.type} for {feat.id}") # pragma: no cover # From here we expect only genes gene = feat From 94a29ac328d7b5c0d9b61ad1eb2a755315068589 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 15:52:19 +0000 Subject: [PATCH 14/72] format --- src/python/tests/gff3/test_simplifier.py | 29 ++++++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index a3f6f9f68..b2beefff4 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -33,7 +33,7 @@ param("ok_genes.gff", "ok_genes_simped.gff", does_not_raise(), id="ok gene"), param("gene_ignored.gff", None, does_not_raise(), id="gene ignored"), param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), - ] + ], ) def test_simpler_gff3_feature( data_dir: Path, @@ -52,7 +52,7 @@ def test_simpler_gff3_feature( feat = simp.records[0].features[0] feat = simp.simpler_gff3_feature(feat) if expected_gff is None: - assert feat == None + assert feat is None else: output_gff = tmp_dir / in_gff # Put it back @@ -117,12 +117,31 @@ def test_simpler_gff3_skip( simp.records.to_gff(output_gff) assert_files(output_gff, data_dir / expected_gff) + @pytest.mark.parametrize( "genome_file, in_gff, expected_gff, expectation", [ - param(None, "genes_badnames.gff", "genes_badnames_noname.gff", does_not_raise(), id="Genes with bad names, no genome"), - param("genome_no_brc4.json", "genes_badnames.gff", "genes_badnames_noname.gff", does_not_raise(), id="Genes with bad names, genome not BRC4"), - param("genome_brc4.json", "genes_badnames.gff", "genes_badnames_brc4name.gff", does_not_raise(), id="Genes with bad names, genome BRC4"), + param( + None, + "genes_badnames.gff", + "genes_badnames_noname.gff", + does_not_raise(), + id="Genes with bad names, no genome", + ), + param( + "genome_no_brc4.json", + "genes_badnames.gff", + "genes_badnames_noname.gff", + does_not_raise(), + id="Genes with bad names, genome not BRC4", + ), + param( + "genome_brc4.json", + "genes_badnames.gff", + "genes_badnames_brc4name.gff", + does_not_raise(), + id="Genes with bad names, genome BRC4", + ), ], ) def test_gffsimplifier_with_genome( From f9364e7ab0a55ea3fb09895649f629c342b6503f Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 15:54:37 +0000 Subject: [PATCH 15/72] simple rename test gff --- src/python/tests/gff3/test_simplifier.py | 6 +++--- .../gff3/test_simplifier/{ok_genes.gff => ok_gene.gff} | 0 .../{ok_genes_simped.gff => ok_gene_simped.gff} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename src/python/tests/gff3/test_simplifier/{ok_genes.gff => ok_gene.gff} (100%) rename src/python/tests/gff3/test_simplifier/{ok_genes_simped.gff => ok_gene_simped.gff} (100%) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index b2beefff4..32b2817f7 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -30,7 +30,7 @@ @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ - param("ok_genes.gff", "ok_genes_simped.gff", does_not_raise(), id="ok gene"), + param("ok_gene.gff", "ok_gene_simped.gff", does_not_raise(), id="ok gene"), param("gene_ignored.gff", None, does_not_raise(), id="gene ignored"), param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), ], @@ -64,7 +64,7 @@ def test_simpler_gff3_feature( @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ - param("ok_genes.gff", "ok_genes_simped.gff", does_not_raise(), id="ok gene"), + param("ok_gene.gff", "ok_gene_simped.gff", does_not_raise(), id="ok gene"), param("bad_gene_type.gff", "", raises(GFFParserError), id="Unsupported gene type"), param("bad_tr_type.gff", "", raises(GFFParserError), id="Unsupported transcript type"), param("bad_subtr_type.gff", "", raises(GFFParserError), id="Unsupported subtranscript type"), @@ -94,7 +94,7 @@ def test_simpler_gff3( param("bad_gene_type.gff", "", None, raises(GFFParserError), id="Unset skip unrecognized, fail"), param("bad_gene_type.gff", "", True, raises(GFFParserError), id="True skip unrecognized, fail"), param("bad_gene_type.gff", "bad_gene_type.gff", False, does_not_raise(), id="bad type, Keep"), - param("ok_genes.gff", "ok_genes_simped.gff", False, does_not_raise(), id="ok type, Keep"), + param("ok_gene.gff", "ok_gene_simped.gff", False, does_not_raise(), id="ok type, Keep"), ], ) def test_simpler_gff3_skip( diff --git a/src/python/tests/gff3/test_simplifier/ok_genes.gff b/src/python/tests/gff3/test_simplifier/ok_gene.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/ok_genes.gff rename to src/python/tests/gff3/test_simplifier/ok_gene.gff diff --git a/src/python/tests/gff3/test_simplifier/ok_genes_simped.gff b/src/python/tests/gff3/test_simplifier/ok_gene_simped.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/ok_genes_simped.gff rename to src/python/tests/gff3/test_simplifier/ok_gene_simped.gff From a5621467a1f816b140174656ade59be4e85e44f6 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 16:02:25 +0000 Subject: [PATCH 16/72] Add lone tests --- src/python/tests/gff3/test_simplifier.py | 3 +++ src/python/tests/gff3/test_simplifier/lone_cds.gff | 3 +++ src/python/tests/gff3/test_simplifier/lone_cds_simped.gff | 6 ++++++ src/python/tests/gff3/test_simplifier/lone_transcript.gff | 3 +++ .../tests/gff3/test_simplifier/lone_transcript_simped.gff | 4 ++++ .../tests/gff3/test_simplifier/ok_protein_coding_gene.gff | 6 ++++++ 6 files changed, 25 insertions(+) create mode 100644 src/python/tests/gff3/test_simplifier/lone_cds.gff create mode 100644 src/python/tests/gff3/test_simplifier/lone_cds_simped.gff create mode 100644 src/python/tests/gff3/test_simplifier/lone_transcript.gff create mode 100644 src/python/tests/gff3/test_simplifier/lone_transcript_simped.gff create mode 100644 src/python/tests/gff3/test_simplifier/ok_protein_coding_gene.gff diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 32b2817f7..660cbeb3a 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -33,6 +33,9 @@ param("ok_gene.gff", "ok_gene_simped.gff", does_not_raise(), id="ok gene"), param("gene_ignored.gff", None, does_not_raise(), id="gene ignored"), param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), + param("ok_protein_coding_gene.gff", "ok_gene_simped.gff", does_not_raise(), id="ok protein_coding_gene"), + param("lone_transcript.gff", "lone_transcript_simped.gff", does_not_raise(), id="lone transcript"), + param("lone_cds.gff", "lone_cds_simped.gff", does_not_raise(), id="lone CDS"), ], ) def test_simpler_gff3_feature( diff --git a/src/python/tests/gff3/test_simplifier/lone_cds.gff b/src/python/tests/gff3/test_simplifier/lone_cds.gff new file mode 100644 index 000000000..385f01ecb --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_cds.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone_cds_simped.gff b/src/python/tests/gff3/test_simplifier/lone_cds_simped.gff new file mode 100644 index 000000000..edd076fcc --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_cds_simped.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=TMP_PREFIX_1 +scaffold1 Source mRNA 1 1000 . - . ID=TMP_PREFIX_1_t1;Parent=TMP_PREFIX_1 +scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1;Parent=TMP_PREFIX_1_t1 +scaffold1 Source exon 1 1000 . - . ID=TMP_PREFIX_1_t1-E1;Parent=TMP_PREFIX_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/lone_transcript.gff b/src/python/tests/gff3/test_simplifier/lone_transcript.gff new file mode 100644 index 000000000..878661029 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_transcript.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transcript 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone_transcript_simped.gff b/src/python/tests/gff3/test_simplifier/lone_transcript_simped.gff new file mode 100644 index 000000000..56c6348e7 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_transcript_simped.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source ncRNA_gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source transcript 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/ok_protein_coding_gene.gff b/src/python/tests/gff3/test_simplifier/ok_protein_coding_gene.gff new file mode 100644 index 000000000..6c1b6d2c3 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/ok_protein_coding_gene.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source protein_coding_gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1_exon1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 From 7c65ae6faffc914a7584daaae1015c9b8f7d91f6 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 16:40:47 +0000 Subject: [PATCH 17/72] Multiple fixes for simplifier --- .../ensembl/io/genomio/gff3/simplifier.py | 60 +++++++++++-------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 88105b4c6..1301d0e17 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -138,7 +138,6 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: ignored_gene_types = self._biotypes["gene"]["ignored"] allowed_non_gene_types = self._biotypes["non_gene"]["supported"] allowed_gene_types = self._biotypes["gene"]["supported"] - transcript_types = self._biotypes["transcript"]["supported"] # Skip explictly ignored features if feat.type in ignored_gene_types: @@ -159,10 +158,8 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: gene.type = "gene" # Create actual genes from transcripts/CDS top level features - if gene.type in transcript_types: - gene = self.create_gene_for_lone_transcript(gene) - elif gene.type == "CDS": - gene = self.create_gene_for_lone_cds(gene) + gene = self.create_gene_for_lone_transcript(gene) + gene = self.create_gene_for_lone_cds(gene) # What to do with unsupported gene types if gene.type not in allowed_gene_types: @@ -176,7 +173,7 @@ def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: return self.clean_gene(gene) # COMPLETION - def create_gene_for_lone_transcript(self, ncrna: SeqFeature) -> SeqFeature: + def create_gene_for_lone_transcript(self, feat: SeqFeature) -> SeqFeature: """Create a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA_gene' for all others Args: @@ -186,42 +183,53 @@ def create_gene_for_lone_transcript(self, ncrna: SeqFeature) -> SeqFeature: The gene that contains the transcript. """ + transcript_types = self._biotypes["transcript"]["supported"] + if feat.type not in transcript_types: + return feat + new_type = "ncRNA_gene" - if ncrna.type in ("tRNA", "rRNA"): + if feat.type in ("tRNA", "rRNA"): new_type = "gene" - logging.debug(f"Put the transcript {ncrna.type} in a {new_type} parent feature") - gene = SeqFeature(ncrna.location, type=new_type) - gene.qualifiers["source"] = ncrna.qualifiers["source"] - gene.sub_features = [ncrna] - gene.id = ncrna.id + logging.debug(f"Put the transcript {feat.type} in a {new_type} parent feature") + new_gene = SeqFeature(feat.location, type=new_type) + new_gene.qualifiers["source"] = feat.qualifiers["source"] + new_gene.sub_features = [feat] - return gene + # Use the transcript ID for the gene, and generate a sub ID for the transcript + new_gene.id = feat.id + new_gene.qualifiers["ID"] = new_gene.id + feat.id = self.stable_ids.generate_transcript_id(new_gene.id, 1) + feat.qualifiers["ID"] = feat.id + + return new_gene - def create_gene_for_lone_cds(self, cds: SeqFeature) -> SeqFeature: + def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: """Returns a gene created for a lone CDS.""" + if feat.type != "CDS": + return feat - logging.debug(f"Put the lone CDS in gene-mRNA parent features for {cds.id}") + logging.debug(f"Put the lone CDS in gene-mRNA parent features for {feat.id}") # Create a transcript, add the CDS - transcript = SeqFeature(cds.location, type="mRNA") - transcript.qualifiers["source"] = cds.qualifiers["source"] - transcript.sub_features = [cds] + transcript = SeqFeature(feat.location, type="mRNA") + transcript.qualifiers["source"] = feat.qualifiers["source"] + transcript.sub_features = [feat] # Add an exon too - exon = SeqFeature(cds.location, type="exon") - exon.qualifiers["source"] = cds.qualifiers["source"] + exon = SeqFeature(feat.location, type="exon") + exon.qualifiers["source"] = feat.qualifiers["source"] transcript.sub_features.append(exon) # Create a gene, add the transcript gene_type = "gene" - if ("pseudo" in cds.qualifiers) and (cds.qualifiers["pseudo"][0] == "true"): + if ("pseudo" in feat.qualifiers) and (feat.qualifiers["pseudo"][0] == "true"): gene_type = "pseudogene" - gene = SeqFeature(cds.location, type=gene_type) - gene.qualifiers["source"] = cds.qualifiers["source"] - gene.sub_features = [transcript] - gene.id = self.stable_ids.generate_gene_id() + new_gene = SeqFeature(feat.location, type=gene_type) + new_gene.qualifiers["source"] = feat.qualifiers["source"] + new_gene.sub_features = [transcript] + new_gene.id = self.stable_ids.generate_gene_id() - return gene + return new_gene def format_mobile_element(self, feat: SeqFeature) -> SeqFeature: """Given a mobile_genetic_element feature, transform it into a transposable_element""" From 3e2cda9050d2c1df8759ff3e6b2d5726f426bd96 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 16:41:08 +0000 Subject: [PATCH 18/72] One method to check 1 feature --- src/python/tests/gff3/test_simplifier.py | 46 +++++++++++++++++-- .../tests/gff3/test_simplifier/ok_gene.gff | 2 +- .../gff3/test_simplifier/ok_gene_simped.gff | 6 --- 3 files changed, 42 insertions(+), 12 deletions(-) delete mode 100644 src/python/tests/gff3/test_simplifier/ok_gene_simped.gff diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 660cbeb3a..7453adf6a 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -19,7 +19,7 @@ from pathlib import Path from typing import Any, Callable, ContextManager, Dict, List, Optional, Union -# from Bio.SeqFeature import SeqFeature, SimpleLocation +from Bio.SeqFeature import SeqFeature import pytest from pytest import param, raises @@ -27,13 +27,48 @@ from ensembl.io.genomio.gff3.simplifier import GFFSimplifier +def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: str) -> None: + simp = GFFSimplifier() + simp.records.from_gff(input_gff) + # Get the only feature + feat = simp.records[0].features[0] + # Apply the named function + check_method = simp.__getattribute__(check_function) + new_feat = check_method(feat) + # Put it back + simp.records[0].features = [new_feat] + simp.records.to_gff(output_gff) + + +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), + param("lone_transcript.gff", "lone_transcript_simped.gff", does_not_raise(), id="lone transcript"), + ], +) +def test_create_gene_for_lone_transcript( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: Optional[PathLike], + expectation: ContextManager, +) -> None: + """Test gene create gene for lone transcript.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / in_gff + with expectation: + check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") + assert_files(output_gff, Path(data_dir / expected_gff)) + @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ - param("ok_gene.gff", "ok_gene_simped.gff", does_not_raise(), id="ok gene"), + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), param("gene_ignored.gff", None, does_not_raise(), id="gene ignored"), param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), - param("ok_protein_coding_gene.gff", "ok_gene_simped.gff", does_not_raise(), id="ok protein_coding_gene"), + param("ok_protein_coding_gene.gff", "ok_gene.gff", does_not_raise(), id="ok protein_coding_gene"), param("lone_transcript.gff", "lone_transcript_simped.gff", does_not_raise(), id="lone transcript"), param("lone_cds.gff", "lone_cds_simped.gff", does_not_raise(), id="lone CDS"), ], @@ -49,6 +84,7 @@ def test_simpler_gff3_feature( """Test simplifying one gene (from a GFF3 file).""" input_gff = data_dir / in_gff with expectation: + # check_one_feature(input_gff, output_gff, "simpler_gff3_feature") simp = GFFSimplifier() simp.records.from_gff(input_gff) # Get the only feature @@ -67,7 +103,7 @@ def test_simpler_gff3_feature( @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ - param("ok_gene.gff", "ok_gene_simped.gff", does_not_raise(), id="ok gene"), + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), param("bad_gene_type.gff", "", raises(GFFParserError), id="Unsupported gene type"), param("bad_tr_type.gff", "", raises(GFFParserError), id="Unsupported transcript type"), param("bad_subtr_type.gff", "", raises(GFFParserError), id="Unsupported subtranscript type"), @@ -97,7 +133,7 @@ def test_simpler_gff3( param("bad_gene_type.gff", "", None, raises(GFFParserError), id="Unset skip unrecognized, fail"), param("bad_gene_type.gff", "", True, raises(GFFParserError), id="True skip unrecognized, fail"), param("bad_gene_type.gff", "bad_gene_type.gff", False, does_not_raise(), id="bad type, Keep"), - param("ok_gene.gff", "ok_gene_simped.gff", False, does_not_raise(), id="ok type, Keep"), + param("ok_gene.gff", "ok_gene.gff", False, does_not_raise(), id="ok type, Keep"), ], ) def test_simpler_gff3_skip( diff --git a/src/python/tests/gff3/test_simplifier/ok_gene.gff b/src/python/tests/gff3/test_simplifier/ok_gene.gff index 571f16d05..943179766 100644 --- a/src/python/tests/gff3/test_simplifier/ok_gene.gff +++ b/src/python/tests/gff3/test_simplifier/ok_gene.gff @@ -2,5 +2,5 @@ ##sequence-region scaffold1 1 1000 scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 -scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1_exon1;Parent=LOREMIPSUM1_t1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/ok_gene_simped.gff b/src/python/tests/gff3/test_simplifier/ok_gene_simped.gff deleted file mode 100644 index 943179766..000000000 --- a/src/python/tests/gff3/test_simplifier/ok_gene_simped.gff +++ /dev/null @@ -1,6 +0,0 @@ -##gff-version 3 -##sequence-region scaffold1 1 1000 -scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 -scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 -scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 -scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 From ea71007bc408ed8c8444458431dfdfcf008b106b Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 16:45:24 +0000 Subject: [PATCH 19/72] Use same check method --- src/python/tests/gff3/test_simplifier.py | 40 ++++++++++-------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 7453adf6a..752770954 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -27,17 +27,20 @@ from ensembl.io.genomio.gff3.simplifier import GFFSimplifier -def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: str) -> None: - simp = GFFSimplifier() - simp.records.from_gff(input_gff) - # Get the only feature - feat = simp.records[0].features[0] - # Apply the named function - check_method = simp.__getattribute__(check_function) - new_feat = check_method(feat) +def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: str) -> SeqFeature: + simp = GFFSimplifier() + simp.records.from_gff(input_gff) + # Get the only feature + feat = simp.records[0].features[0] + # Apply the named function + check_method = simp.__getattribute__(check_function) + new_feat = check_method(feat) + + if new_feat is not None: # Put it back simp.records[0].features = [new_feat] simp.records.to_gff(output_gff) + return new_feat @pytest.mark.parametrize( @@ -59,8 +62,9 @@ def test_create_gene_for_lone_transcript( input_gff = data_dir / in_gff output_gff = tmp_dir / in_gff with expectation: - check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") - assert_files(output_gff, Path(data_dir / expected_gff)) + new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") + if new_feat: + assert_files(output_gff, Path(data_dir / expected_gff)) @pytest.mark.parametrize( "in_gff, expected_gff, expectation", @@ -83,20 +87,10 @@ def test_simpler_gff3_feature( ) -> None: """Test simplifying one gene (from a GFF3 file).""" input_gff = data_dir / in_gff + output_gff = tmp_dir / in_gff with expectation: - # check_one_feature(input_gff, output_gff, "simpler_gff3_feature") - simp = GFFSimplifier() - simp.records.from_gff(input_gff) - # Get the only feature - feat = simp.records[0].features[0] - feat = simp.simpler_gff3_feature(feat) - if expected_gff is None: - assert feat is None - else: - output_gff = tmp_dir / in_gff - # Put it back - simp.records[0].features = [feat] - simp.records.to_gff(output_gff) + new_feat = check_one_feature(input_gff, output_gff, "simpler_gff3_feature") + if new_feat: assert_files(output_gff, Path(data_dir / expected_gff)) From ae533175dc9e17915617a704a6d7c08bf57a4554 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 16:50:51 +0000 Subject: [PATCH 20/72] Fixes --- src/python/tests/gff3/test_simplifier.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 752770954..c0e82bfab 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -28,12 +28,13 @@ def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: str) -> SeqFeature: + """Load 1 feature from a GFF, apply a function, then write it back to a GFF.""" simp = GFFSimplifier() simp.records.from_gff(input_gff) # Get the only feature feat = simp.records[0].features[0] # Apply the named function - check_method = simp.__getattribute__(check_function) + check_method = getattr(simp, check_function) new_feat = check_method(feat) if new_feat is not None: @@ -55,7 +56,7 @@ def test_create_gene_for_lone_transcript( tmp_dir: Path, assert_files: Callable, in_gff: PathLike, - expected_gff: Optional[PathLike], + expected_gff: PathLike, expectation: ContextManager, ) -> None: """Test gene create gene for lone transcript.""" @@ -66,6 +67,7 @@ def test_create_gene_for_lone_transcript( if new_feat: assert_files(output_gff, Path(data_dir / expected_gff)) + @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ @@ -90,7 +92,9 @@ def test_simpler_gff3_feature( output_gff = tmp_dir / in_gff with expectation: new_feat = check_one_feature(input_gff, output_gff, "simpler_gff3_feature") - if new_feat: + if expected_gff is None: + assert new_feat is None + else: assert_files(output_gff, Path(data_dir / expected_gff)) From 4b00565288c88145f664b29ddff5869a1c141947 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 17:03:59 +0000 Subject: [PATCH 21/72] Test lone CDS, fix make basic IDs --- .../ensembl/io/genomio/gff3/simplifier.py | 3 +++ src/python/tests/gff3/test_simplifier.py | 26 +++++++++++++++++++ .../gff3/test_simplifier/lone_cds_simped.gff | 2 +- .../tests/gff3/test_simplifier/lone_rrna.gff | 3 +++ .../gff3/test_simplifier/lone_rrna_simped.gff | 4 +++ .../tests/gff3/test_simplifier/lone_trna.gff | 3 +++ .../gff3/test_simplifier/lone_trna_simped.gff | 4 +++ 7 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 src/python/tests/gff3/test_simplifier/lone_rrna.gff create mode 100644 src/python/tests/gff3/test_simplifier/lone_rrna_simped.gff create mode 100644 src/python/tests/gff3/test_simplifier/lone_trna.gff create mode 100644 src/python/tests/gff3/test_simplifier/lone_trna_simped.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 1301d0e17..5745fd2f9 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -228,6 +228,9 @@ def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: new_gene.qualifiers["source"] = feat.qualifiers["source"] new_gene.sub_features = [transcript] new_gene.id = self.stable_ids.generate_gene_id() + new_gene.qualifiers["ID"] = new_gene.id + transcript.id = self.stable_ids.generate_transcript_id(new_gene.id, 1) + transcript.qualifiers["ID"] = transcript.id return new_gene diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index c0e82bfab..1a0e3fd6b 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -49,6 +49,8 @@ def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: [ param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), param("lone_transcript.gff", "lone_transcript_simped.gff", does_not_raise(), id="lone transcript"), + param("lone_trna.gff", "lone_trna_simped.gff", does_not_raise(), id="lone tRNA"), + param("lone_rrna.gff", "lone_rrna_simped.gff", does_not_raise(), id="lone rRNA"), ], ) def test_create_gene_for_lone_transcript( @@ -68,6 +70,30 @@ def test_create_gene_for_lone_transcript( assert_files(output_gff, Path(data_dir / expected_gff)) +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), + param("lone_cds.gff", "lone_cds_simped.gff", does_not_raise(), id="lone CDS"), + ], +) +def test_create_gene_for_lone_cds( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + expectation: ContextManager, +) -> None: + """Test gene create gene for lone CDS.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / in_gff + with expectation: + new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_cds") + if new_feat: + assert_files(output_gff, Path(data_dir / expected_gff)) + + @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ diff --git a/src/python/tests/gff3/test_simplifier/lone_cds_simped.gff b/src/python/tests/gff3/test_simplifier/lone_cds_simped.gff index edd076fcc..30514b4af 100644 --- a/src/python/tests/gff3/test_simplifier/lone_cds_simped.gff +++ b/src/python/tests/gff3/test_simplifier/lone_cds_simped.gff @@ -3,4 +3,4 @@ scaffold1 Source gene 1 1000 . - . ID=TMP_PREFIX_1 scaffold1 Source mRNA 1 1000 . - . ID=TMP_PREFIX_1_t1;Parent=TMP_PREFIX_1 scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1;Parent=TMP_PREFIX_1_t1 -scaffold1 Source exon 1 1000 . - . ID=TMP_PREFIX_1_t1-E1;Parent=TMP_PREFIX_1_t1 +scaffold1 Source exon 1 1000 . - . Parent=TMP_PREFIX_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/lone_rrna.gff b/src/python/tests/gff3/test_simplifier/lone_rrna.gff new file mode 100644 index 000000000..d0f856236 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_rrna.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source rRNA 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone_rrna_simped.gff b/src/python/tests/gff3/test_simplifier/lone_rrna_simped.gff new file mode 100644 index 000000000..1179c2235 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_rrna_simped.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source rRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone_trna.gff b/src/python/tests/gff3/test_simplifier/lone_trna.gff new file mode 100644 index 000000000..72f74fb1c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_trna.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source tRNA 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone_trna_simped.gff b/src/python/tests/gff3/test_simplifier/lone_trna_simped.gff new file mode 100644 index 000000000..0ed4ebca4 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_trna_simped.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source tRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 From 2322d2ac606e3c11506c9ca0ded0a8059a2432ee Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 13 Mar 2024 17:55:48 +0000 Subject: [PATCH 22/72] Remove lone* from main gff3 --- src/python/tests/gff3/test_simplifier.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 1a0e3fd6b..aec454578 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -101,8 +101,6 @@ def test_create_gene_for_lone_cds( param("gene_ignored.gff", None, does_not_raise(), id="gene ignored"), param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), param("ok_protein_coding_gene.gff", "ok_gene.gff", does_not_raise(), id="ok protein_coding_gene"), - param("lone_transcript.gff", "lone_transcript_simped.gff", does_not_raise(), id="lone transcript"), - param("lone_cds.gff", "lone_cds_simped.gff", does_not_raise(), id="lone CDS"), ], ) def test_simpler_gff3_feature( From f3b22eb9af47b5db74c18e1508f46ad1d782a126 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Thu, 14 Mar 2024 13:42:26 +0000 Subject: [PATCH 23/72] support CDS pseudo --- src/python/ensembl/io/genomio/gff3/simplifier.py | 1 + src/python/tests/gff3/test_simplifier.py | 1 + src/python/tests/gff3/test_simplifier/lone_cds_pseudo.gff | 3 +++ .../tests/gff3/test_simplifier/lone_cds_pseudo_simped.gff | 6 ++++++ 4 files changed, 11 insertions(+) create mode 100644 src/python/tests/gff3/test_simplifier/lone_cds_pseudo.gff create mode 100644 src/python/tests/gff3/test_simplifier/lone_cds_pseudo_simped.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 5745fd2f9..270bb37f5 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -224,6 +224,7 @@ def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: gene_type = "gene" if ("pseudo" in feat.qualifiers) and (feat.qualifiers["pseudo"][0] == "true"): gene_type = "pseudogene" + del feat.qualifiers["pseudo"] new_gene = SeqFeature(feat.location, type=gene_type) new_gene.qualifiers["source"] = feat.qualifiers["source"] new_gene.sub_features = [transcript] diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index aec454578..1c0628867 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -75,6 +75,7 @@ def test_create_gene_for_lone_transcript( [ param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), param("lone_cds.gff", "lone_cds_simped.gff", does_not_raise(), id="lone CDS"), + param("lone_cds_pseudo.gff", "lone_cds_pseudo_simped.gff", does_not_raise(), id="lone pseudo CDS"), ], ) def test_create_gene_for_lone_cds( diff --git a/src/python/tests/gff3/test_simplifier/lone_cds_pseudo.gff b/src/python/tests/gff3/test_simplifier/lone_cds_pseudo.gff new file mode 100644 index 000000000..c87e997c8 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_cds_pseudo.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1;pseudo=true diff --git a/src/python/tests/gff3/test_simplifier/lone_cds_pseudo_simped.gff b/src/python/tests/gff3/test_simplifier/lone_cds_pseudo_simped.gff new file mode 100644 index 000000000..1d4d03466 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone_cds_pseudo_simped.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source pseudogene 1 1000 . - . ID=TMP_PREFIX_1 +scaffold1 Source mRNA 1 1000 . - . ID=TMP_PREFIX_1_t1;Parent=TMP_PREFIX_1 +scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1;Parent=TMP_PREFIX_1_t1 +scaffold1 Source exon 1 1000 . - . Parent=TMP_PREFIX_1_t1 From bd3a4f1da9e46346ddeb119a90715476a44f434f Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Thu, 14 Mar 2024 16:29:18 +0000 Subject: [PATCH 24/72] Rearrange mobile element code --- .../ensembl/io/genomio/gff3/simplifier.py | 110 +++++++++--------- 1 file changed, 52 insertions(+), 58 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 270bb37f5..44c697f5d 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -128,46 +128,34 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: if not self.skip_unrecognized: raise GFFParserError("Unrecognized types found, abort") - def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: + def simpler_gff3_feature(self, gene: SeqFeature) -> Optional[SeqFeature]: """Creates a simpler version of a GFF3 feature. If the feature is invalid/skippable, returns None. """ - - ignored_gene_types = self._biotypes["gene"]["ignored"] - allowed_non_gene_types = self._biotypes["non_gene"]["supported"] - allowed_gene_types = self._biotypes["gene"]["supported"] - - # Skip explictly ignored features - if feat.type in ignored_gene_types: + # Special cases + non_gene = self.normalize_non_gene(gene) + if non_gene: + return non_gene + if gene.type in self._biotypes["gene"]["ignored"]: return None - # Special processing of non-gene features - if feat.type in allowed_non_gene_types: - if feat.type in ("mobile_genetic_element", "transposable_element"): - feat = self.format_mobile_element(feat) - return feat - # This check is a failsafe in case you add supported non-genes - raise NotImplementedError(f"Unsupported non-gene: {feat.type} for {feat.id}") # pragma: no cover - - # From here we expect only genes - gene = feat - + # Synonym if gene.type == "protein_coding_gene": gene.type = "gene" - # Create actual genes from transcripts/CDS top level features + # Lone sub-gene features, create a gene gene = self.create_gene_for_lone_transcript(gene) gene = self.create_gene_for_lone_cds(gene) # What to do with unsupported gene types - if gene.type not in allowed_gene_types: + if gene.type not in self._biotypes["gene"]["supported"]: self.fail_types.add(f"gene={gene.type}") logging.debug(f"Unsupported gene type: {gene.type} (for {gene.id})") return None - # Normalize, store annotation, and return the cleaned up gene + # Normalize and store gene = self.normalize_gene(gene) self.annotations.store_gene(gene) return self.clean_gene(gene) @@ -235,46 +223,52 @@ def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: return new_gene - def format_mobile_element(self, feat: SeqFeature) -> SeqFeature: - """Given a mobile_genetic_element feature, transform it into a transposable_element""" - - # Change mobile_genetic_element into a transposable_element feature - if feat.type == "mobile_genetic_element": - mobile_element_type = feat.qualifiers.get("mobile_element_type", []) - if mobile_element_type: - # Get the type (and name) from the attrib - if ":" in mobile_element_type[0]: - element_type, element_name = mobile_element_type[0].split(":") - description = f"{element_type} ({element_name})" - else: - element_type = mobile_element_type[0] - description = element_type - - # Keep the metadata in the description if the type is known - if element_type in ("transposon", "retrotransposon"): - feat.type = "transposable_element" - if not feat.qualifiers.get("product"): - feat.qualifiers["product"] = [description] - else: - logging.warning( - f"Mobile genetic element 'mobile_element_type' is not transposon: {element_type}" - ) - return feat - else: - logging.warning("Mobile genetic element does not have a 'mobile_element_type' tag") - return feat - elif feat.type == "transposable_element": - pass + def normalize_non_gene(self, feat: SeqFeature) -> SeqFeature: + """Special case for non-genes (currently transposable elements only). + Returns None if not applicable + """ + + if feat.type not in self._biotypes["non_gene"]["supported"]: + return + if feat.type in ("mobile_genetic_element", "transposable_element"): + feat = self._normalize_mobile_genetic_element(feat) + # Generate ID if needed + feat.id = self.stable_ids.normalize_gene_id(feat) + feat.qualifiers["ID"] = feat.id + + self.annotations.add_feature(feat, "transposable_element") + return self.clean_gene(feat) else: - logging.warning(f"Feature {feat.id} is not a supported TE feature {feat.type}") + # This check is a failsafe in case you add supported non-genes + raise NotImplementedError(f"Unsupported non-gene: {feat.type} for {feat.id}") # pragma: no cover + + def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: + """Normalize a mobile element if it has a mobile_element_type field.""" + + if feat.type != "mobile_genetic_element": return feat - # Generate ID if needed and add it to the functional annotation - feat.id = self.stable_ids.normalize_gene_id(feat) - self.annotations.add_feature(feat, "transposable_element") - feat = self.clean_gene(feat) + try: + mobile_element_type = feat.qualifiers["mobile_element_type"] + except KeyError: + logging.warning("No 'mobile_element_type' tag found") + return feat + + # Get the type (and name) from the attrib + if ":" in mobile_element_type[0]: + element_type, element_name = mobile_element_type[0].split(":") + description = f"{element_type} ({element_name})" + else: + description = mobile_element_type[0] - return feat + # Keep the metadata in the description if the type is known + if element_type in ("transposon", "retrotransposon"): + feat.type = "transposable_element" + if not feat.qualifiers.get("product"): + feat.qualifiers["product"] = [description] + return feat + else: + raise GFFParserError(f"'mobile_element_type' is not a transposon: {element_type}") # GENES def clean_gene(self, gene: SeqFeature) -> SeqFeature: From 761d2225edc596e8123a42cb88687f4830cff51f Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Thu, 14 Mar 2024 17:11:28 +0000 Subject: [PATCH 25/72] Add mobile tests + fixes --- .../ensembl/io/genomio/gff3/simplifier.py | 11 +++--- src/python/tests/gff3/test_simplifier.py | 36 ++++++++++++++++++- .../test_simplifier/non_gene/mobile_1name.gff | 3 ++ .../non_gene/mobile_1name_simped.gff3 | 3 ++ .../test_simplifier/non_gene/mobile_full.gff | 3 ++ .../non_gene/mobile_full_simped.gff | 3 ++ .../gff3/test_simplifier/non_gene/te.gff | 3 ++ 7 files changed, 54 insertions(+), 8 deletions(-) create mode 100644 src/python/tests/gff3/test_simplifier/non_gene/mobile_1name.gff create mode 100644 src/python/tests/gff3/test_simplifier/non_gene/mobile_1name_simped.gff3 create mode 100644 src/python/tests/gff3/test_simplifier/non_gene/mobile_full.gff create mode 100644 src/python/tests/gff3/test_simplifier/non_gene/mobile_full_simped.gff create mode 100644 src/python/tests/gff3/test_simplifier/non_gene/te.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 44c697f5d..626987016 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -223,7 +223,7 @@ def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: return new_gene - def normalize_non_gene(self, feat: SeqFeature) -> SeqFeature: + def normalize_non_gene(self, feat: SeqFeature) -> Optional[SeqFeature]: """Special case for non-genes (currently transposable elements only). Returns None if not applicable """ @@ -231,6 +231,7 @@ def normalize_non_gene(self, feat: SeqFeature) -> SeqFeature: if feat.type not in self._biotypes["non_gene"]["supported"]: return if feat.type in ("mobile_genetic_element", "transposable_element"): + feat.type = "transposable_element" feat = self._normalize_mobile_genetic_element(feat) # Generate ID if needed feat.id = self.stable_ids.normalize_gene_id(feat) @@ -244,10 +245,6 @@ def normalize_non_gene(self, feat: SeqFeature) -> SeqFeature: def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: """Normalize a mobile element if it has a mobile_element_type field.""" - - if feat.type != "mobile_genetic_element": - return feat - try: mobile_element_type = feat.qualifiers["mobile_element_type"] except KeyError: @@ -259,11 +256,11 @@ def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: element_type, element_name = mobile_element_type[0].split(":") description = f"{element_type} ({element_name})" else: - description = mobile_element_type[0] + element_type = mobile_element_type[0] + description = element_type # Keep the metadata in the description if the type is known if element_type in ("transposon", "retrotransposon"): - feat.type = "transposable_element" if not feat.qualifiers.get("product"): feat.qualifiers["product"] = [description] return feat diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 1c0628867..2a1448968 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -17,7 +17,7 @@ from contextlib import nullcontext as does_not_raise from os import PathLike from pathlib import Path -from typing import Any, Callable, ContextManager, Dict, List, Optional, Union +from typing import Callable, ContextManager, Dict, Optional from Bio.SeqFeature import SeqFeature import pytest @@ -95,6 +95,40 @@ def test_create_gene_for_lone_cds( assert_files(output_gff, Path(data_dir / expected_gff)) +@pytest.mark.parametrize( + "in_type, in_mobile_type, out_type, out_description, expectation", + [ + param("gene", None, "gene", None, does_not_raise(), id="Gene, skip"), + param("transposable_element", None, "transposable_element", None, does_not_raise(), id="TE"), + param("mobile_genetic_element", None, "transposable_element", None, does_not_raise(), id="MGE"), + param("transposable_element", "transposon", "transposable_element", "transposon", does_not_raise(), id="MGE, transposon"), + param("transposable_element", "transposon:LOREM", "transposable_element", "transposon (LOREM)", does_not_raise(), id="MGE, transposon named"), + param("transposable_element", "retrotransposon:LOREM", "transposable_element", "retrotransposon (LOREM)", does_not_raise(), id="MGE, retrotransposon named"), + param("transposable_element", "UNKNOWNtransposon:LOREM", "transposable_element", None, raises(GFFParserError), id="MGE, unknown type"), + ], +) +def test_normalize_non_gene( + in_type: str, + in_mobile_type: Optional[str], + out_type: str, + out_description: Optional[str], + expectation: ContextManager, +) -> None: + """Test gene create gene for lone CDS.""" + simp = GFFSimplifier() + feat = SeqFeature(None, in_type) + feat.qualifiers = {"source": "LOREM"} + if in_mobile_type is not None: + feat.qualifiers["mobile_element_type"] = [in_mobile_type] + feat.sub_features = [] + with expectation: + new_feat = simp.normalize_non_gene(feat) + if new_feat is not None: + assert new_feat.type == out_type + if out_description is not None: + assert new_feat.qualifiers == feat.qualifiers + + @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ diff --git a/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name.gff b/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name.gff new file mode 100644 index 000000000..495ad3731 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source mobile_genetic_element 1 1000 . - . ID=LOREMIPSUM1;gbkey=mobile_element;mobile_element_type=transposon diff --git a/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name_simped.gff3 b/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name_simped.gff3 new file mode 100644 index 000000000..128627f25 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name_simped.gff3 @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transposable_element 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/non_gene/mobile_full.gff b/src/python/tests/gff3/test_simplifier/non_gene/mobile_full.gff new file mode 100644 index 000000000..65952b0a0 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/mobile_full.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source mobile_genetic_element 1 1000 . - . ID=LOREMIPSUM1;gbkey=mobile_element;mobile_element_type=transposon:rnd-1_family-319/TvMULE1 diff --git a/src/python/tests/gff3/test_simplifier/non_gene/mobile_full_simped.gff b/src/python/tests/gff3/test_simplifier/non_gene/mobile_full_simped.gff new file mode 100644 index 000000000..128627f25 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/mobile_full_simped.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transposable_element 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/non_gene/te.gff b/src/python/tests/gff3/test_simplifier/non_gene/te.gff new file mode 100644 index 000000000..128627f25 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/te.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transposable_element 1 1000 . - . ID=LOREMIPSUM1 From 2eda3641a50bc6d2f13fb92a034bc250a4a90160 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Thu, 14 Mar 2024 17:15:29 +0000 Subject: [PATCH 26/72] Test product if any --- src/python/tests/gff3/test_simplifier.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 2a1448968..71b29c8b6 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -96,20 +96,22 @@ def test_create_gene_for_lone_cds( @pytest.mark.parametrize( - "in_type, in_mobile_type, out_type, out_description, expectation", + "in_type, in_mobile_type, in_product, out_type, out_description, expectation", [ - param("gene", None, "gene", None, does_not_raise(), id="Gene, skip"), - param("transposable_element", None, "transposable_element", None, does_not_raise(), id="TE"), - param("mobile_genetic_element", None, "transposable_element", None, does_not_raise(), id="MGE"), - param("transposable_element", "transposon", "transposable_element", "transposon", does_not_raise(), id="MGE, transposon"), - param("transposable_element", "transposon:LOREM", "transposable_element", "transposon (LOREM)", does_not_raise(), id="MGE, transposon named"), - param("transposable_element", "retrotransposon:LOREM", "transposable_element", "retrotransposon (LOREM)", does_not_raise(), id="MGE, retrotransposon named"), - param("transposable_element", "UNKNOWNtransposon:LOREM", "transposable_element", None, raises(GFFParserError), id="MGE, unknown type"), + param("gene", None, None, "gene", None, does_not_raise(), id="Gene, skip"), + param("transposable_element", None, None, "transposable_element", None, does_not_raise(), id="TE"), + param("mobile_genetic_element", None, None, "transposable_element", None, does_not_raise(), id="MGE"), + param("transposable_element", "transposon", None, "transposable_element", "transposon", does_not_raise(), id="MGE, transposon"), + param("transposable_element", "transposon:LOREM", None, "transposable_element", "transposon (LOREM)", does_not_raise(), id="MGE, transposon named"), + param("transposable_element", "retrotransposon:LOREM", None, "transposable_element", "retrotransposon (LOREM)", does_not_raise(), id="MGE, retrotransposon named"), + param("transposable_element", "UNKNOWNtransposon:LOREM", None, "transposable_element", None, raises(GFFParserError), id="MGE, unknown type"), + param("transposable_element", "transposon", "PROD", "transposable_element", "PROD", does_not_raise(), id="MGE, transposon, product exists"), ], ) def test_normalize_non_gene( in_type: str, in_mobile_type: Optional[str], + in_product: Optional[str], out_type: str, out_description: Optional[str], expectation: ContextManager, @@ -120,6 +122,8 @@ def test_normalize_non_gene( feat.qualifiers = {"source": "LOREM"} if in_mobile_type is not None: feat.qualifiers["mobile_element_type"] = [in_mobile_type] + if in_product is not None: + feat.qualifiers["product"] = [in_product] feat.sub_features = [] with expectation: new_feat = simp.normalize_non_gene(feat) From 5b9b6b0a54f14e9478761785737f451174817500 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Thu, 14 Mar 2024 17:18:59 +0000 Subject: [PATCH 27/72] Fixes --- .../ensembl/io/genomio/gff3/simplifier.py | 12 ++--- src/python/tests/gff3/test_simplifier.py | 52 ++++++++++++++++--- 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 626987016..ca38bf834 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -229,7 +229,7 @@ def normalize_non_gene(self, feat: SeqFeature) -> Optional[SeqFeature]: """ if feat.type not in self._biotypes["non_gene"]["supported"]: - return + return None if feat.type in ("mobile_genetic_element", "transposable_element"): feat.type = "transposable_element" feat = self._normalize_mobile_genetic_element(feat) @@ -239,9 +239,8 @@ def normalize_non_gene(self, feat: SeqFeature) -> Optional[SeqFeature]: self.annotations.add_feature(feat, "transposable_element") return self.clean_gene(feat) - else: - # This check is a failsafe in case you add supported non-genes - raise NotImplementedError(f"Unsupported non-gene: {feat.type} for {feat.id}") # pragma: no cover + # This is a failsafe in case you add supported non-genes + raise NotImplementedError(f"Unsupported non-gene: {feat.type} for {feat.id}") # pragma: no cover def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: """Normalize a mobile element if it has a mobile_element_type field.""" @@ -250,7 +249,7 @@ def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: except KeyError: logging.warning("No 'mobile_element_type' tag found") return feat - + # Get the type (and name) from the attrib if ":" in mobile_element_type[0]: element_type, element_name = mobile_element_type[0].split(":") @@ -264,8 +263,7 @@ def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: if not feat.qualifiers.get("product"): feat.qualifiers["product"] = [description] return feat - else: - raise GFFParserError(f"'mobile_element_type' is not a transposon: {element_type}") + raise GFFParserError(f"'mobile_element_type' is not a transposon: {element_type}") # GENES def clean_gene(self, gene: SeqFeature) -> SeqFeature: diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 71b29c8b6..65bfed99f 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -17,7 +17,7 @@ from contextlib import nullcontext as does_not_raise from os import PathLike from pathlib import Path -from typing import Callable, ContextManager, Dict, Optional +from typing import Callable, ContextManager, Optional from Bio.SeqFeature import SeqFeature import pytest @@ -101,11 +101,51 @@ def test_create_gene_for_lone_cds( param("gene", None, None, "gene", None, does_not_raise(), id="Gene, skip"), param("transposable_element", None, None, "transposable_element", None, does_not_raise(), id="TE"), param("mobile_genetic_element", None, None, "transposable_element", None, does_not_raise(), id="MGE"), - param("transposable_element", "transposon", None, "transposable_element", "transposon", does_not_raise(), id="MGE, transposon"), - param("transposable_element", "transposon:LOREM", None, "transposable_element", "transposon (LOREM)", does_not_raise(), id="MGE, transposon named"), - param("transposable_element", "retrotransposon:LOREM", None, "transposable_element", "retrotransposon (LOREM)", does_not_raise(), id="MGE, retrotransposon named"), - param("transposable_element", "UNKNOWNtransposon:LOREM", None, "transposable_element", None, raises(GFFParserError), id="MGE, unknown type"), - param("transposable_element", "transposon", "PROD", "transposable_element", "PROD", does_not_raise(), id="MGE, transposon, product exists"), + param( + "transposable_element", + "transposon", + None, + "transposable_element", + "transposon", + does_not_raise(), + id="MGE, transposon", + ), + param( + "transposable_element", + "transposon:LOREM", + None, + "transposable_element", + "transposon (LOREM)", + does_not_raise(), + id="MGE, transposon named", + ), + param( + "transposable_element", + "retrotransposon:LOREM", + None, + "transposable_element", + "retrotransposon (LOREM)", + does_not_raise(), + id="MGE, retrotransposon named", + ), + param( + "transposable_element", + "UNKNOWNtransposon:LOREM", + None, + "transposable_element", + None, + raises(GFFParserError), + id="MGE, unknown type", + ), + param( + "transposable_element", + "transposon", + "PROD", + "transposable_element", + "PROD", + does_not_raise(), + id="MGE, transposon, product exists", + ), ], ) def test_normalize_non_gene( From e12d32845fcc5972775af043dad8522afbbc1bbc Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 10:48:18 +0000 Subject: [PATCH 28/72] Move test files in subfolders --- src/python/tests/gff3/test_simplifier.py | 14 +++++++------- .../test_simplifier/{lone_cds.gff => lone/cds.gff} | 0 .../{lone_cds_pseudo.gff => lone/cds_pseudo.gff} | 0 .../cds_pseudo_simped.gff} | 0 .../{lone_cds_simped.gff => lone/cds_simped.gff} | 0 .../{lone_rrna.gff => lone/rrna.gff} | 0 .../{lone_rrna_simped.gff => lone/rrna_simped.gff} | 0 .../{lone_transcript.gff => lone/transcript.gff} | 0 .../transcript_simped.gff} | 0 .../{lone_trna.gff => lone/trna.gff} | 0 .../{lone_trna_simped.gff => lone/trna_simped.gff} | 0 11 files changed, 7 insertions(+), 7 deletions(-) rename src/python/tests/gff3/test_simplifier/{lone_cds.gff => lone/cds.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_cds_pseudo.gff => lone/cds_pseudo.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_cds_pseudo_simped.gff => lone/cds_pseudo_simped.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_cds_simped.gff => lone/cds_simped.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_rrna.gff => lone/rrna.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_rrna_simped.gff => lone/rrna_simped.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_transcript.gff => lone/transcript.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_transcript_simped.gff => lone/transcript_simped.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_trna.gff => lone/trna.gff} (100%) rename src/python/tests/gff3/test_simplifier/{lone_trna_simped.gff => lone/trna_simped.gff} (100%) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 65bfed99f..3d48ef8b3 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -48,9 +48,9 @@ def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: "in_gff, expected_gff, expectation", [ param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), - param("lone_transcript.gff", "lone_transcript_simped.gff", does_not_raise(), id="lone transcript"), - param("lone_trna.gff", "lone_trna_simped.gff", does_not_raise(), id="lone tRNA"), - param("lone_rrna.gff", "lone_rrna_simped.gff", does_not_raise(), id="lone rRNA"), + param("lone/transcript.gff", "lone/transcript_simped.gff", does_not_raise(), id="lone transcript"), + param("lone/trna.gff", "lone/trna_simped.gff", does_not_raise(), id="lone tRNA"), + param("lone/rrna.gff", "lone/rrna_simped.gff", does_not_raise(), id="lone rRNA"), ], ) def test_create_gene_for_lone_transcript( @@ -63,7 +63,7 @@ def test_create_gene_for_lone_transcript( ) -> None: """Test gene create gene for lone transcript.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / in_gff + output_gff = tmp_dir / Path(in_gff).name with expectation: new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") if new_feat: @@ -74,8 +74,8 @@ def test_create_gene_for_lone_transcript( "in_gff, expected_gff, expectation", [ param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), - param("lone_cds.gff", "lone_cds_simped.gff", does_not_raise(), id="lone CDS"), - param("lone_cds_pseudo.gff", "lone_cds_pseudo_simped.gff", does_not_raise(), id="lone pseudo CDS"), + param("lone/cds.gff", "lone/cds_simped.gff", does_not_raise(), id="lone CDS"), + param("lone/cds_pseudo.gff", "lone/cds_pseudo_simped.gff", does_not_raise(), id="lone pseudo CDS"), ], ) def test_create_gene_for_lone_cds( @@ -88,7 +88,7 @@ def test_create_gene_for_lone_cds( ) -> None: """Test gene create gene for lone CDS.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / in_gff + output_gff = tmp_dir / Path(in_gff).name with expectation: new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_cds") if new_feat: diff --git a/src/python/tests/gff3/test_simplifier/lone_cds.gff b/src/python/tests/gff3/test_simplifier/lone/cds.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_cds.gff rename to src/python/tests/gff3/test_simplifier/lone/cds.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_cds_pseudo.gff b/src/python/tests/gff3/test_simplifier/lone/cds_pseudo.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_cds_pseudo.gff rename to src/python/tests/gff3/test_simplifier/lone/cds_pseudo.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_cds_pseudo_simped.gff b/src/python/tests/gff3/test_simplifier/lone/cds_pseudo_simped.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_cds_pseudo_simped.gff rename to src/python/tests/gff3/test_simplifier/lone/cds_pseudo_simped.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_cds_simped.gff b/src/python/tests/gff3/test_simplifier/lone/cds_simped.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_cds_simped.gff rename to src/python/tests/gff3/test_simplifier/lone/cds_simped.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_rrna.gff b/src/python/tests/gff3/test_simplifier/lone/rrna.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_rrna.gff rename to src/python/tests/gff3/test_simplifier/lone/rrna.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_rrna_simped.gff b/src/python/tests/gff3/test_simplifier/lone/rrna_simped.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_rrna_simped.gff rename to src/python/tests/gff3/test_simplifier/lone/rrna_simped.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_transcript.gff b/src/python/tests/gff3/test_simplifier/lone/transcript.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_transcript.gff rename to src/python/tests/gff3/test_simplifier/lone/transcript.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_transcript_simped.gff b/src/python/tests/gff3/test_simplifier/lone/transcript_simped.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_transcript_simped.gff rename to src/python/tests/gff3/test_simplifier/lone/transcript_simped.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_trna.gff b/src/python/tests/gff3/test_simplifier/lone/trna.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_trna.gff rename to src/python/tests/gff3/test_simplifier/lone/trna.gff diff --git a/src/python/tests/gff3/test_simplifier/lone_trna_simped.gff b/src/python/tests/gff3/test_simplifier/lone/trna_simped.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/lone_trna_simped.gff rename to src/python/tests/gff3/test_simplifier/lone/trna_simped.gff From 4f6d7ee43d03e7473e70618646fb34d70054ca66 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 10:48:32 +0000 Subject: [PATCH 29/72] Show file name in diff --- src/python/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py index b6021a119..00d452ad4 100644 --- a/src/python/tests/conftest.py +++ b/src/python/tests/conftest.py @@ -76,7 +76,7 @@ def _assert_files(result_path: Path, expected_path: Path) -> None: results = result_fh.readlines() with open(expected_path, "r") as expected_fh: expected = expected_fh.readlines() - files_diff = list(unified_diff(results, expected, fromfile="Test-made file", tofile="Expected file")) + files_diff = list(unified_diff(results, expected, fromfile=f"Test-made file {result_path.name}", tofile=f"Expected file {expected_path.name}")) assert_message = f"Test-made and expected files differ\n{' '.join(files_diff)}" assert len(files_diff) == 0, assert_message From 75d4429c8e64e7eb312e3709955c362ca5a287be Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 10:48:45 +0000 Subject: [PATCH 30/72] Add clean_gene tests --- src/python/tests/gff3/test_simplifier.py | 24 +++++++++++++++++++ .../gff3/test_simplifier/clean/extra.gff | 6 +++++ .../test_simplifier/clean/extra_clean.gff | 6 +++++ 3 files changed, 36 insertions(+) create mode 100644 src/python/tests/gff3/test_simplifier/clean/extra.gff create mode 100644 src/python/tests/gff3/test_simplifier/clean/extra_clean.gff diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 3d48ef8b3..735e17aee 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -173,6 +173,30 @@ def test_normalize_non_gene( assert new_feat.qualifiers == feat.qualifiers +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), + param("clean/extra.gff", "clean/extra_clean.gff", does_not_raise(), id="ok gene with extra attribs"), + ], +) +def test_clean_gene( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + expectation: ContextManager, +) -> None: + """Test clean gene.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / Path(in_gff).name + with expectation: + new_feat = check_one_feature(input_gff, output_gff, "clean_gene") + if new_feat: + assert_files(output_gff, Path(data_dir / expected_gff)) + + @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ diff --git a/src/python/tests/gff3/test_simplifier/clean/extra.gff b/src/python/tests/gff3/test_simplifier/clean/extra.gff new file mode 100644 index 000000000..9d82141f8 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/clean/extra.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1;extra_field=Lorem; +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;extra_field=Lorem;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . extra_field=Lorem;ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;extra_field=Lorem;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/clean/extra_clean.gff b/src/python/tests/gff3/test_simplifier/clean/extra_clean.gff new file mode 100644 index 000000000..943179766 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/clean/extra_clean.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 From 6edc29e364cc88eb1b6ec1c6d3f765dfd5c5e123 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 10:59:04 +0000 Subject: [PATCH 31/72] remove checks: would fail anyway --- src/python/ensembl/io/genomio/gff3/simplifier.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index ca38bf834..5dc820fe0 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -270,19 +270,15 @@ def clean_gene(self, gene: SeqFeature) -> SeqFeature: """Return the same gene without qualifiers unrelated to the gene structure.""" old_gene_qualifiers = gene.qualifiers - try: - gene.qualifiers = {"ID": gene.id, "source": old_gene_qualifiers["source"]} - except KeyError as err: - raise KeyError(f"Missing source for {gene.id}") from err + gene.qualifiers = {"ID": gene.id, "source": old_gene_qualifiers["source"]} for transcript in gene.sub_features: # Replace qualifiers old_transcript_qualifiers = transcript.qualifiers transcript.qualifiers = { "ID": transcript.id, "Parent": gene.id, + "source": old_transcript_qualifiers["source"] } - if "source" in old_transcript_qualifiers: - transcript.qualifiers["source"] = old_transcript_qualifiers["source"] for feat in transcript.sub_features: old_qualifiers = feat.qualifiers From d4d23d5e7dfca26c044930bc0915db0f93b2ecc8 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 11:02:36 +0000 Subject: [PATCH 32/72] remove phase check: invalid gff3 anyway --- src/python/ensembl/io/genomio/gff3/simplifier.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 5dc820fe0..0359b0ad6 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -288,12 +288,7 @@ def clean_gene(self, gene: SeqFeature) -> SeqFeature: "source": old_qualifiers["source"], } if feat.type == "CDS": - try: - feat.qualifiers["phase"] = old_qualifiers["phase"] - except KeyError as err: - raise KeyError( - f"Missing phase for gene {gene.type} {gene.id}, CDS {feat.id} ({old_qualifiers})" - ) from err + feat.qualifiers["phase"] = old_qualifiers["phase"] return gene From e11633ebd1a9809214bef125114f5599cf350ef9 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 12:48:19 +0000 Subject: [PATCH 33/72] Test gene segments --- .../ensembl/io/genomio/gff3/simplifier.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 0359b0ad6..8e5da70f2 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -385,20 +385,21 @@ def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature: transcript: Gene segment transcript feature. """ + if transcript.type not in ("C_gene_segment", "V_gene_segment"): + return transcript + # Change V/C_gene_segment into a its corresponding transcript names - if transcript.type in ("C_gene_segment", "V_gene_segment"): + try: standard_name = transcript.qualifiers["standard_name"][0] - biotype = transcript.type.replace("_segment", "") - if re.search(r"\b(immunoglobulin|ig)\b", standard_name, flags=re.IGNORECASE): - biotype = f"IG_{biotype}" - elif re.search(r"\bt[- _]cell\b", standard_name, flags=re.IGNORECASE): - biotype = f"TR_{biotype}" - else: - logging.warning( - f"Unexpected 'standard_name' content for feature {transcript.id}: {standard_name}" - ) - return transcript - transcript.type = biotype + except KeyError as err: + raise GFFParserError(f"No standard_name for {transcript.type}") from err + biotype = transcript.type.replace("_segment", "") + if re.search(r"\b(immunoglobulin|ig)\b", standard_name, flags=re.IGNORECASE): + transcript.type = f"IG_{biotype}" + elif re.search(r"\bt[- _]cell\b", standard_name, flags=re.IGNORECASE): + transcript.type = f"TR_{biotype}" + else: + raise GFFParserError(f"Unexpected 'standard_name' for {transcript.id}: {standard_name}") return transcript def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFeature) -> SeqFeature: From a113f7cbb6b019b3c8cd4c462be6c1922b753b02 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 13:14:49 +0000 Subject: [PATCH 34/72] Segments tests --- src/python/tests/gff3/test_simplifier.py | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 735e17aee..ae2de142c 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -17,7 +17,7 @@ from contextlib import nullcontext as does_not_raise from os import PathLike from pathlib import Path -from typing import Callable, ContextManager, Optional +from typing import Callable, ContextManager, List, Optional from Bio.SeqFeature import SeqFeature import pytest @@ -173,6 +173,35 @@ def test_normalize_non_gene( assert new_feat.qualifiers == feat.qualifiers +@pytest.mark.parametrize( + "in_type, name, out_type, expectation", + [ + param("mRNA", "", "mRNA", does_not_raise(), id="mRNA no change"), + param("C_gene_segment", "", "C_gene_segment", raises(GFFParserError), id="no standard name"), + param("C_gene_segment", "immunoglobulin", "IG_C_gene", does_not_raise(), id="C immunoglobulin"), + param("C_gene_segment", "ig", "IG_C_gene", does_not_raise(), id="C ig"), + param("V_gene_segment", "t-cell", "TR_V_gene", does_not_raise(), id="V t-cell"), + param("V_gene_segment", "T_cell", "TR_V_gene", does_not_raise(), id="V T_cell"), + param("V_gene_segment", "Lorem Ipsum", "", raises(GFFParserError), id="V T_cell"), + ], +) +def test_format_gene_segments( + in_type: str, + name: Optional[List[str]], + out_type: str, + expectation: ContextManager, +) -> None: + """Test gene create gene for lone CDS.""" + simp = GFFSimplifier() + feat = SeqFeature(None, in_type) + if name: + feat.qualifiers["standard_name"] = [name] + feat.sub_features = [] + with expectation: + new_feat = simp.format_gene_segments(feat) + assert new_feat.type == out_type + + @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ From f06d18e526489670a93aa391922acd1c82e4b7ba Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 15:06:28 +0000 Subject: [PATCH 35/72] Move remove_cds to restructure + tests --- .../ensembl/io/genomio/gff3/restructure.py | 26 ++++++++++++++++ .../ensembl/io/genomio/gff3/simplifier.py | 30 ++----------------- src/python/tests/gff3/test_restructure.py | 29 ++++++++++++++++++ 3 files changed, 58 insertions(+), 27 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/restructure.py b/src/python/ensembl/io/genomio/gff3/restructure.py index 56531b146..9c03e61df 100644 --- a/src/python/ensembl/io/genomio/gff3/restructure.py +++ b/src/python/ensembl/io/genomio/gff3/restructure.py @@ -22,6 +22,7 @@ "move_only_exons_to_new_mrna", "move_cds_to_existing_mrna", "remove_extra_exons", + "remove_cds_from_pseudogene", ] from collections import Counter @@ -264,3 +265,28 @@ def remove_extra_exons(gene: SeqFeature) -> None: gene.sub_features += others else: raise GFFParserError(f"Can't remove extra exons for {gene.id}, not all start with 'id-'") + + +def remove_cds_from_pseudogene(gene: SeqFeature) -> None: + """Removes the CDS from a pseudogene. + + This assumes the CDSs are sub features of the transcript or the gene. + + """ + if gene.type != "pseudogene": + return + + gene_subfeats = [] + for transcript in gene.sub_features: + if transcript.type == "CDS": + logging.debug(f"Remove pseudo CDS {transcript.id}") + continue + new_subfeats = [] + for feat in transcript.sub_features: + if feat.type == "CDS": + logging.debug(f"Remove pseudo CDS {feat.id}") + continue + new_subfeats.append(feat) + transcript.sub_features = new_subfeats + gene_subfeats.append(transcript) + gene.sub_features = gene_subfeats diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 8e5da70f2..354e02146 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -37,7 +37,7 @@ from ensembl.io.genomio.utils.json_utils import get_json from .extract_annotation import FunctionalAnnotations from .id_allocator import StableIDAllocator -from .restructure import restructure_gene +from .restructure import restructure_gene, remove_cds_from_pseudogene from .exceptions import GFFParserError @@ -277,7 +277,7 @@ def clean_gene(self, gene: SeqFeature) -> SeqFeature: transcript.qualifiers = { "ID": transcript.id, "Parent": gene.id, - "source": old_transcript_qualifiers["source"] + "source": old_transcript_qualifiers["source"], } for feat in transcript.sub_features: @@ -317,31 +317,7 @@ def normalize_pseudogene(self, gene: SeqFeature) -> None: if self.allow_pseudogene_with_cds: self.stable_ids.normalize_pseudogene_cds_id(gene) else: - self.remove_cds_from_pseudogene(gene) - - def remove_cds_from_pseudogene(self, gene: SeqFeature) -> None: - """Removes the CDS from a pseudogene. - - This assumes the CDSs are sub features of the transcript or the gene. - - """ - if gene.type != "pseudogene": - return - - gene_subfeats = [] - for transcript in gene.sub_features: - if transcript.type == "CDS": - logging.debug(f"Remove pseudo CDS {transcript.id}") - continue - new_subfeats = [] - for feat in transcript.sub_features: - if feat.type == "CDS": - logging.debug(f"Remove pseudo CDS {feat.id}") - continue - new_subfeats.append(feat) - transcript.sub_features = new_subfeats - gene_subfeats.append(transcript) - gene.sub_features = gene_subfeats + remove_cds_from_pseudogene(gene) # TRANSCRIPTS def normalize_transcripts(self, gene: SeqFeature) -> None: diff --git a/src/python/tests/gff3/test_restructure.py b/src/python/tests/gff3/test_restructure.py index b3d1779c1..a42278568 100644 --- a/src/python/tests/gff3/test_restructure.py +++ b/src/python/tests/gff3/test_restructure.py @@ -334,3 +334,32 @@ def test_restructure_gene( with expectation: restructure.restructure_gene(gene) assert gen.get_sub_structure(gene) == {"gene": expected_children} + + +@pytest.mark.parametrize( + "children, expected_children", + [ + param("gene", "gene", id="gene"), + param("pseudogene", "pseudogene", id="pseudogene"), + param({"pseudogene": ["mRNA"]}, {"pseudogene": ["mRNA"]}, id="pseudogene mRNA"), + param( + {"pseudogene": [{"mRNA": ["CDS", "CDS"]}]}, {"pseudogene": ["mRNA"]}, id="pseudogene mRNA CDSs" + ), + param( + {"pseudogene": [{"mRNA": ["CDS", "exon"]}]}, + {"pseudogene": [{"mRNA": ["exon"]}]}, + id="pseudogene mRNA CDSs, exons", + ), + param({"pseudogene": ["CDS", "CDS"]}, "pseudogene", id="pseudogene CDSs"), + ], +) +def test_remove_cds_from_pseudogene( + children: List[Any], + expected_children: List[Any], +) -> None: + """Test CDS removal from pseudogene.""" + gen = FeatGenerator() + gene = gen.make_structure([children])[0] + # raise Exception(gene) + restructure.remove_cds_from_pseudogene(gene) + assert gen.get_sub_structure(gene) == expected_children From 98a9efccd104ca5d8071f3178a2f68c5d84686a5 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 15 Mar 2024 15:29:37 +0000 Subject: [PATCH 36/72] Remove unneeded check --- src/python/ensembl/io/genomio/gff3/simplifier.py | 3 +-- src/python/tests/conftest.py | 9 ++++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 354e02146..643c57534 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -391,8 +391,7 @@ def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFea # Replace qualifiers old_exon_qualifiers = feat.qualifiers feat.qualifiers = {"Parent": transcript.id} - if "source" in old_exon_qualifiers: - feat.qualifiers["source"] = old_exon_qualifiers["source"] + feat.qualifiers["source"] = old_exon_qualifiers["source"] elif feat.type == "CDS": # New CDS ID feat.id = self.stable_ids.normalize_cds_id(feat.id) diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py index 00d452ad4..4ec58cfb4 100644 --- a/src/python/tests/conftest.py +++ b/src/python/tests/conftest.py @@ -76,7 +76,14 @@ def _assert_files(result_path: Path, expected_path: Path) -> None: results = result_fh.readlines() with open(expected_path, "r") as expected_fh: expected = expected_fh.readlines() - files_diff = list(unified_diff(results, expected, fromfile=f"Test-made file {result_path.name}", tofile=f"Expected file {expected_path.name}")) + files_diff = list( + unified_diff( + results, + expected, + fromfile=f"Test-made file {result_path.name}", + tofile=f"Expected file {expected_path.name}", + ) + ) assert_message = f"Test-made and expected files differ\n{' '.join(files_diff)}" assert len(files_diff) == 0, assert_message From ce28207bbce0e1aa2c8c2010c920e37c9f2cd2ff Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Thu, 21 Mar 2024 16:52:47 +0000 Subject: [PATCH 37/72] Change default skip, update skip tests --- src/python/ensembl/io/genomio/gff3/simplifier.py | 4 ++-- src/python/tests/gff3/test_simplifier.py | 12 +++++++++--- .../gff3/test_simplifier/bad_gene_type_skipped.gff | 2 ++ 3 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 src/python/tests/gff3/test_simplifier/bad_gene_type_skipped.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 643c57534..fea39e2e1 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -73,7 +73,7 @@ class GFFSimplifier: def __init__( self, genome_path: Optional[PathLike] = None, - skip_unrecognized: Optional[bool] = True, + skip_unrecognized: Optional[bool] = False, allow_pseudogene_with_cds: Optional[bool] = False, ): """Create an object that simplifies `SeqFeature` objects. @@ -123,7 +123,7 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: record.features = cleaned_features if self.fail_types: - fail_errors = "\n ".join(self.fail_types.keys()) + fail_errors = "\n ".join(list(self.fail_types)) logging.warning(f"Unrecognized types found:\n {fail_errors}") if not self.skip_unrecognized: raise GFFParserError("Unrecognized types found, abort") diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index ae2de142c..17e399793 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -285,9 +285,15 @@ def test_simpler_gff3( "in_gff, expected_gff, skip_unrecognized, expectation", [ param("bad_gene_type.gff", "", None, raises(GFFParserError), id="Unset skip unrecognized, fail"), - param("bad_gene_type.gff", "", True, raises(GFFParserError), id="True skip unrecognized, fail"), - param("bad_gene_type.gff", "bad_gene_type.gff", False, does_not_raise(), id="bad type, Keep"), - param("ok_gene.gff", "ok_gene.gff", False, does_not_raise(), id="ok type, Keep"), + param( + "bad_gene_type.gff", + "bad_gene_type_skipped.gff", + True, + does_not_raise(), + id="True skip unrecognized, no fail", + ), + param("bad_gene_type.gff", "", False, raises(GFFParserError), id="bad type, fail"), + param("ok_gene.gff", "ok_gene.gff", False, does_not_raise(), id="ok type, fail"), ], ) def test_simpler_gff3_skip( diff --git a/src/python/tests/gff3/test_simplifier/bad_gene_type_skipped.gff b/src/python/tests/gff3/test_simplifier/bad_gene_type_skipped.gff new file mode 100644 index 000000000..710a1cea7 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/bad_gene_type_skipped.gff @@ -0,0 +1,2 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 From 3fef8448ef062577eed2c2d11000c71e95e5da20 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 22 Mar 2024 17:47:21 +0000 Subject: [PATCH 38/72] Apply suggestions from code review Co-authored-by: J. Alvarez-Jarreta --- .../ensembl/io/genomio/gff3/restructure.py | 20 +++++++++---------- src/python/tests/gff3/test_records.py | 2 +- src/python/tests/gff3/test_restructure.py | 1 - src/python/tests/gff3/test_simplifier.py | 14 ++++++------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/restructure.py b/src/python/ensembl/io/genomio/gff3/restructure.py index 9c03e61df..2e9a6afc4 100644 --- a/src/python/ensembl/io/genomio/gff3/restructure.py +++ b/src/python/ensembl/io/genomio/gff3/restructure.py @@ -268,7 +268,7 @@ def remove_extra_exons(gene: SeqFeature) -> None: def remove_cds_from_pseudogene(gene: SeqFeature) -> None: - """Removes the CDS from a pseudogene. + """Removes the CDSs from a pseudogene. This assumes the CDSs are sub features of the transcript or the gene. @@ -280,13 +280,13 @@ def remove_cds_from_pseudogene(gene: SeqFeature) -> None: for transcript in gene.sub_features: if transcript.type == "CDS": logging.debug(f"Remove pseudo CDS {transcript.id}") - continue - new_subfeats = [] - for feat in transcript.sub_features: - if feat.type == "CDS": - logging.debug(f"Remove pseudo CDS {feat.id}") - continue - new_subfeats.append(feat) - transcript.sub_features = new_subfeats - gene_subfeats.append(transcript) + else: + new_subfeats = [] + for feat in transcript.sub_features: + if feat.type == "CDS": + logging.debug(f"Remove pseudo CDS {feat.id}") + else: + new_subfeats.append(feat) + transcript.sub_features = new_subfeats + gene_subfeats.append(transcript) gene.sub_features = gene_subfeats diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py index 6de6d0683..10960ef3e 100644 --- a/src/python/tests/gff3/test_records.py +++ b/src/python/tests/gff3/test_records.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Unit testing of `ensembl.io.genomio.gff3.simplifier` Record object.""" +"""Unit testing of `ensembl.io.genomio.gff3.simplifier.Records` class.""" from os import PathLike from pathlib import Path diff --git a/src/python/tests/gff3/test_restructure.py b/src/python/tests/gff3/test_restructure.py index a42278568..06ea0e3b9 100644 --- a/src/python/tests/gff3/test_restructure.py +++ b/src/python/tests/gff3/test_restructure.py @@ -360,6 +360,5 @@ def test_remove_cds_from_pseudogene( """Test CDS removal from pseudogene.""" gen = FeatGenerator() gene = gen.make_structure([children])[0] - # raise Exception(gene) restructure.remove_cds_from_pseudogene(gene) assert gen.get_sub_structure(gene) == expected_children diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 17e399793..cb14a3379 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -54,8 +54,8 @@ def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: ], ) def test_create_gene_for_lone_transcript( + tmp_path: Path, data_dir: Path, - tmp_dir: Path, assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, @@ -63,7 +63,7 @@ def test_create_gene_for_lone_transcript( ) -> None: """Test gene create gene for lone transcript.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / Path(in_gff).name + output_gff = tmp_path / Path(in_gff).name with expectation: new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") if new_feat: @@ -187,7 +187,7 @@ def test_normalize_non_gene( ) def test_format_gene_segments( in_type: str, - name: Optional[List[str]], + name: str, out_type: str, expectation: ContextManager, ) -> None: @@ -284,7 +284,7 @@ def test_simpler_gff3( @pytest.mark.parametrize( "in_gff, expected_gff, skip_unrecognized, expectation", [ - param("bad_gene_type.gff", "", None, raises(GFFParserError), id="Unset skip unrecognized, fail"), + param("bad_gene_type.gff", "", False, raises(GFFParserError), id="Unset skip unrecognized, fail"), param( "bad_gene_type.gff", "bad_gene_type_skipped.gff", @@ -302,16 +302,14 @@ def test_simpler_gff3_skip( assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, - skip_unrecognized: Optional[bool], + skip_unrecognized: bool, expectation: ContextManager, ) -> None: """Test simplifying genes from GFF3 files.""" input_gff = data_dir / in_gff output_gff = tmp_dir / in_gff with expectation: - simp = GFFSimplifier() - if skip_unrecognized is not None: - simp.skip_unrecognized = skip_unrecognized + simp = GFFSimplifier(skip_unrecognized=skip_unrecognized) simp.simpler_gff3(input_gff) simp.records.to_gff(output_gff) assert_files(output_gff, data_dir / expected_gff) From 699a179310ab3783ccbc9d13fc563cf63b49c037 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 22 Mar 2024 17:50:01 +0000 Subject: [PATCH 39/72] Apply suggestions from code review Co-authored-by: J. Alvarez-Jarreta --- .../ensembl/io/genomio/gff3/simplifier.py | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 5836dbfcd..2030de15a 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -45,7 +45,12 @@ class Records(list): """List of GFF3 SeqRecords.""" def from_gff(self, in_gff_path: PathLike, excluded: Optional[List[str]] = None) -> None: - """Load records from a GFF3 file.""" + """Loads records from a GFF3 file. + + Args: + in_gff_path: Input GFF3 file path. + excluded: Record IDs to not load from the GFF3 file. + """ if excluded is None: excluded = [] with Path(in_gff_path).open("r") as in_gff_fh: @@ -163,16 +168,11 @@ def simpler_gff3_feature(self, gene: SeqFeature) -> Optional[SeqFeature]: self.annotations.store_gene(gene) return self.clean_gene(gene) - # COMPLETION def create_gene_for_lone_transcript(self, feat: SeqFeature) -> SeqFeature: - """Create a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA_gene' for all others + """Returns a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA_gene' for all others. Args: - ncrna: the transcript for which we want to create a gene. - - Returns: - The gene that contains the transcript. - + ncrna: The transcript for which we want to create a gene. """ transcript_types = self._biotypes["transcript"]["supported"] if feat.type not in transcript_types: @@ -227,8 +227,15 @@ def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: return new_gene def normalize_non_gene(self, feat: SeqFeature) -> Optional[SeqFeature]: - """Special case for non-genes (currently transposable elements only). - Returns None if not applicable + """Returns a normalised "non-gene" or `None` if not applicable. + + Only transposable elements supported at the moment. + + Args: + feat: Feature to normalise. + + Raises: + NotImplementedError: If the feature is a not supported non-gene. """ if feat.type not in self._biotypes["non_gene"]["supported"]: @@ -254,12 +261,10 @@ def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: return feat # Get the type (and name) from the attrib - if ":" in mobile_element_type[0]: - element_type, element_name = mobile_element_type[0].split(":") - description = f"{element_type} ({element_name})" - else: - element_type = mobile_element_type[0] - description = element_type + element_type, _, element_name = mobile_element_type[0].partition(":") + description = element_type + if element_name: + description += f" ({element_name})" # Keep the metadata in the description if the type is known if element_type in ("transposon", "retrotransposon"): @@ -268,7 +273,6 @@ def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: return feat raise GFFParserError(f"'mobile_element_type' is not a transposon: {element_type}") - # GENES def clean_gene(self, gene: SeqFeature) -> SeqFeature: """Return the same gene without qualifiers unrelated to the gene structure.""" @@ -311,7 +315,6 @@ def normalize_gene(self, gene: SeqFeature) -> SeqFeature: return gene - # PSEUDOGENES def normalize_pseudogene(self, gene: SeqFeature) -> None: """Normalize CDSs if allowed, otherwise remove them.""" if gene.type != "pseudogene": @@ -322,7 +325,6 @@ def normalize_pseudogene(self, gene: SeqFeature) -> None: else: remove_cds_from_pseudogene(gene) - # TRANSCRIPTS def normalize_transcripts(self, gene: SeqFeature) -> None: """Normalizes a transcript.""" @@ -363,6 +365,8 @@ def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature: Args: transcript: Gene segment transcript feature. + Raises: + GFFParserError: Missing or unexpected transcript's standard name. """ if transcript.type not in ("C_gene_segment", "V_gene_segment"): return transcript From d55f451cd6ad0d44f59f187d3b4520a1d2840fad Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 22 Mar 2024 18:05:15 +0000 Subject: [PATCH 40/72] Remove duplicated code, moved to restructure --- .../ensembl/io/genomio/gff3/simplifier.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 2030de15a..3d2bf68ac 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -470,30 +470,6 @@ def cds_gene(self, cds: SeqFeature) -> SeqFeature: return gene - def remove_cds_from_pseudogene(self, gene: SeqFeature) -> None: - """Removes the CDS from a pseudogene. - - This assumes the CDSs are sub features of the transcript or the gene. - - """ - if gene.type != "pseudogene": - return - - gene_subfeats = [] - for transcript in gene.sub_features: - if transcript.type == "CDS": - logging.debug(f"Remove pseudo CDS {transcript.id}") - continue - new_subfeats = [] - for feat in transcript.sub_features: - if feat.type == "CDS": - logging.debug(f"Remove pseudo CDS {feat.id}") - continue - new_subfeats.append(feat) - transcript.sub_features = new_subfeats - gene_subfeats.append(transcript) - gene.sub_features = gene_subfeats - def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: """Returns gene representations from a miRNA gene that can be loaded in an Ensembl database. From ce132bcf74e92650413a3ac37cdb05122b21807c Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 22 Mar 2024 18:05:28 +0000 Subject: [PATCH 41/72] Fix indentation --- src/python/ensembl/io/genomio/gff3/restructure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/restructure.py b/src/python/ensembl/io/genomio/gff3/restructure.py index 2e9a6afc4..a4c77df71 100644 --- a/src/python/ensembl/io/genomio/gff3/restructure.py +++ b/src/python/ensembl/io/genomio/gff3/restructure.py @@ -287,6 +287,6 @@ def remove_cds_from_pseudogene(gene: SeqFeature) -> None: logging.debug(f"Remove pseudo CDS {feat.id}") else: new_subfeats.append(feat) - transcript.sub_features = new_subfeats - gene_subfeats.append(transcript) + transcript.sub_features = new_subfeats + gene_subfeats.append(transcript) gene.sub_features = gene_subfeats From 193539ab2d99431e78410ec03e6a4f76a29b2d46 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 22 Mar 2024 18:06:45 +0000 Subject: [PATCH 42/72] format --- src/python/ensembl/io/genomio/gff3/simplifier.py | 6 +++--- src/python/tests/gff3/test_simplifier.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 3d2bf68ac..6384c7bcb 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -228,12 +228,12 @@ def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: def normalize_non_gene(self, feat: SeqFeature) -> Optional[SeqFeature]: """Returns a normalised "non-gene" or `None` if not applicable. - + Only transposable elements supported at the moment. - + Args: feat: Feature to normalise. - + Raises: NotImplementedError: If the feature is a not supported non-gene. """ diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index cb14a3379..0dc467485 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -17,7 +17,7 @@ from contextlib import nullcontext as does_not_raise from os import PathLike from pathlib import Path -from typing import Callable, ContextManager, List, Optional +from typing import Callable, ContextManager, Optional from Bio.SeqFeature import SeqFeature import pytest From 72413e2e78a5357aaa4424bbd2452c2357a434bd Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 22 Mar 2024 18:16:37 +0000 Subject: [PATCH 43/72] Remove unnecessary expectations --- src/python/tests/gff3/test_simplifier.py | 91 ++++++++++-------------- 1 file changed, 39 insertions(+), 52 deletions(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 0dc467485..a0fc94214 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -45,12 +45,12 @@ def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: @pytest.mark.parametrize( - "in_gff, expected_gff, expectation", + "in_gff, expected_gff", [ - param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), - param("lone/transcript.gff", "lone/transcript_simped.gff", does_not_raise(), id="lone transcript"), - param("lone/trna.gff", "lone/trna_simped.gff", does_not_raise(), id="lone tRNA"), - param("lone/rrna.gff", "lone/rrna_simped.gff", does_not_raise(), id="lone rRNA"), + param("ok_gene.gff", "ok_gene.gff", id="ok gene"), + param("lone/transcript.gff", "lone/transcript_simped.gff", id="lone transcript"), + param("lone/trna.gff", "lone/trna_simped.gff", id="lone tRNA"), + param("lone/rrna.gff", "lone/rrna_simped.gff", id="lone rRNA"), ], ) def test_create_gene_for_lone_transcript( @@ -59,23 +59,21 @@ def test_create_gene_for_lone_transcript( assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, - expectation: ContextManager, ) -> None: """Test gene create gene for lone transcript.""" input_gff = data_dir / in_gff output_gff = tmp_path / Path(in_gff).name - with expectation: - new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") - if new_feat: - assert_files(output_gff, Path(data_dir / expected_gff)) + new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") + if new_feat: + assert_files(output_gff, Path(data_dir / expected_gff)) @pytest.mark.parametrize( - "in_gff, expected_gff, expectation", + "in_gff, expected_gff", [ - param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), - param("lone/cds.gff", "lone/cds_simped.gff", does_not_raise(), id="lone CDS"), - param("lone/cds_pseudo.gff", "lone/cds_pseudo_simped.gff", does_not_raise(), id="lone pseudo CDS"), + param("ok_gene.gff", "ok_gene.gff", id="ok gene"), + param("lone/cds.gff", "lone/cds_simped.gff", id="lone CDS"), + param("lone/cds_pseudo.gff", "lone/cds_pseudo_simped.gff", id="lone pseudo CDS"), ], ) def test_create_gene_for_lone_cds( @@ -84,15 +82,13 @@ def test_create_gene_for_lone_cds( assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, - expectation: ContextManager, ) -> None: """Test gene create gene for lone CDS.""" input_gff = data_dir / in_gff output_gff = tmp_dir / Path(in_gff).name - with expectation: - new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_cds") - if new_feat: - assert_files(output_gff, Path(data_dir / expected_gff)) + new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_cds") + if new_feat: + assert_files(output_gff, Path(data_dir / expected_gff)) @pytest.mark.parametrize( @@ -203,10 +199,10 @@ def test_format_gene_segments( @pytest.mark.parametrize( - "in_gff, expected_gff, expectation", + "in_gff, expected_gff", [ - param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), - param("clean/extra.gff", "clean/extra_clean.gff", does_not_raise(), id="ok gene with extra attribs"), + param("ok_gene.gff", "ok_gene.gff", id="ok gene"), + param("clean/extra.gff", "clean/extra_clean.gff", id="ok gene with extra attribs"), ], ) def test_clean_gene( @@ -215,24 +211,22 @@ def test_clean_gene( assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, - expectation: ContextManager, ) -> None: """Test clean gene.""" input_gff = data_dir / in_gff output_gff = tmp_dir / Path(in_gff).name - with expectation: - new_feat = check_one_feature(input_gff, output_gff, "clean_gene") - if new_feat: - assert_files(output_gff, Path(data_dir / expected_gff)) + new_feat = check_one_feature(input_gff, output_gff, "clean_gene") + if new_feat: + assert_files(output_gff, Path(data_dir / expected_gff)) @pytest.mark.parametrize( - "in_gff, expected_gff, expectation", + "in_gff, expected_gff", [ - param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), - param("gene_ignored.gff", None, does_not_raise(), id="gene ignored"), - param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), - param("ok_protein_coding_gene.gff", "ok_gene.gff", does_not_raise(), id="ok protein_coding_gene"), + param("ok_gene.gff", "ok_gene.gff", id="ok gene"), + param("gene_ignored.gff", None, id="gene ignored"), + param("mobile_te.gff", "mobile_te.gff", id="TE"), + param("ok_protein_coding_gene.gff", "ok_gene.gff", id="ok protein_coding_gene"), ], ) def test_simpler_gff3_feature( @@ -241,17 +235,15 @@ def test_simpler_gff3_feature( assert_files: Callable, in_gff: PathLike, expected_gff: Optional[PathLike], - expectation: ContextManager, ) -> None: """Test simplifying one gene (from a GFF3 file).""" input_gff = data_dir / in_gff output_gff = tmp_dir / in_gff - with expectation: - new_feat = check_one_feature(input_gff, output_gff, "simpler_gff3_feature") - if expected_gff is None: - assert new_feat is None - else: - assert_files(output_gff, Path(data_dir / expected_gff)) + new_feat = check_one_feature(input_gff, output_gff, "simpler_gff3_feature") + if expected_gff is None: + assert new_feat is None + else: + assert_files(output_gff, Path(data_dir / expected_gff)) @pytest.mark.parametrize( @@ -316,27 +308,24 @@ def test_simpler_gff3_skip( @pytest.mark.parametrize( - "genome_file, in_gff, expected_gff, expectation", + "genome_file, in_gff, expected_gff", [ param( None, "genes_badnames.gff", "genes_badnames_noname.gff", - does_not_raise(), id="Genes with bad names, no genome", ), param( "genome_no_brc4.json", "genes_badnames.gff", "genes_badnames_noname.gff", - does_not_raise(), id="Genes with bad names, genome not BRC4", ), param( "genome_brc4.json", "genes_badnames.gff", "genes_badnames_brc4name.gff", - does_not_raise(), id="Genes with bad names, genome BRC4", ), ], @@ -348,16 +337,14 @@ def test_gffsimplifier_with_genome( genome_file: Optional[PathLike], in_gff: PathLike, expected_gff: PathLike, - expectation: ContextManager, ) -> None: """Test simplifying genes from GFF3 files.""" input_gff = data_dir / in_gff output_gff = tmp_dir / in_gff - with expectation: - if genome_file is None: - simp = GFFSimplifier() - else: - simp = GFFSimplifier(genome_path=data_dir / genome_file) - simp.simpler_gff3(input_gff) - simp.records.to_gff(output_gff) - assert_files(output_gff, data_dir / expected_gff) + if genome_file is None: + simp = GFFSimplifier() + else: + simp = GFFSimplifier(genome_path=data_dir / genome_file) + simp.simpler_gff3(input_gff) + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) From 5113884f3d40a5106a606f39ef98e2a36f950d55 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 22 Mar 2024 18:21:03 +0000 Subject: [PATCH 44/72] Exclude non tested things from the expectation --- src/python/tests/gff3/test_simplifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index a0fc94214..24b3386c9 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -300,9 +300,10 @@ def test_simpler_gff3_skip( """Test simplifying genes from GFF3 files.""" input_gff = data_dir / in_gff output_gff = tmp_dir / in_gff + simp = GFFSimplifier(skip_unrecognized=skip_unrecognized) with expectation: - simp = GFFSimplifier(skip_unrecognized=skip_unrecognized) simp.simpler_gff3(input_gff) + if expected_gff: simp.records.to_gff(output_gff) assert_files(output_gff, data_dir / expected_gff) From 540301d9418408113f0e1038a1b7e08befea4340 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Mon, 25 Mar 2024 15:53:40 +0000 Subject: [PATCH 45/72] remove 1 test, not very useful --- src/python/tests/gff3/test_records.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py index 10960ef3e..432780858 100644 --- a/src/python/tests/gff3/test_records.py +++ b/src/python/tests/gff3/test_records.py @@ -27,7 +27,6 @@ @pytest.mark.parametrize( "in_gff, excluded, expected_loaded", [ - param("record_n1.gff", None, ["scaffold1"], id="1 record"), param("record_n2.gff", None, ["scaffold1", "scaffold2"], id="2 records"), param("record_n2.gff", ["scaffold1"], ["scaffold2"], id="2 records, exclude 1"), param("record_n1.gff", ["Lorem"], ["scaffold1"], id="1 record, exclude not in record"), From 197ab66f15fc3e6cdc7be22016a4b96eb3718c5d Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Mon, 25 Mar 2024 16:19:21 +0000 Subject: [PATCH 46/72] Test not implemented --- src/python/ensembl/io/genomio/gff3/simplifier.py | 2 +- src/python/tests/gff3/test_simplifier.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 6384c7bcb..f8a436f6d 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -250,7 +250,7 @@ def normalize_non_gene(self, feat: SeqFeature) -> Optional[SeqFeature]: self.annotations.add_feature(feat, "transposable_element") return self.clean_gene(feat) # This is a failsafe in case you add supported non-genes - raise NotImplementedError(f"Unsupported non-gene: {feat.type} for {feat.id}") # pragma: no cover + raise NotImplementedError(f"Unsupported non-gene: {feat.type} for {feat.id}") def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: """Normalize a mobile element if it has a mobile_element_type field.""" diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 24b3386c9..27d1d7367 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -152,7 +152,7 @@ def test_normalize_non_gene( out_description: Optional[str], expectation: ContextManager, ) -> None: - """Test gene create gene for lone CDS.""" + """Test non-gene normalization.""" simp = GFFSimplifier() feat = SeqFeature(None, in_type) feat.qualifiers = {"source": "LOREM"} @@ -169,6 +169,15 @@ def test_normalize_non_gene( assert new_feat.qualifiers == feat.qualifiers +def test_normalize_non_gene_not_implemented() -> None: + """Test non-gene not in the biotype list.""" + simp = GFFSimplifier() + simp._biotypes = {"non_gene": {"supported": ["non_gene_name"]}} # pylint: disable=protected-access + feat = SeqFeature(None, "non_gene_name") + with raises(NotImplementedError): + simp.normalize_non_gene(feat) + + @pytest.mark.parametrize( "in_type, name, out_type, expectation", [ From 65bc58e6439e42e800fbd12bee2a08747591af27 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Mon, 25 Mar 2024 16:50:00 +0000 Subject: [PATCH 47/72] Apply suggestions from code review Co-authored-by: J. Alvarez-Jarreta --- src/python/tests/gff3/test_records.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py index 432780858..1b13f65a4 100644 --- a/src/python/tests/gff3/test_records.py +++ b/src/python/tests/gff3/test_records.py @@ -39,10 +39,7 @@ def test_from_gff( input_gff = data_dir / in_gff records = Records() - if excluded is None: - records.from_gff(input_gff) - else: - records.from_gff(input_gff, excluded) + records.from_gff(input_gff, excluded) record_names = [record.id for record in records] assert record_names == expected_loaded From 3dd4babc463767b88fec36eaa2740bdf3edeec0a Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 26 Mar 2024 14:43:55 +0000 Subject: [PATCH 48/72] Update how miRNA are processed + test --- .../ensembl/io/genomio/gff3/simplifier.py | 26 ++++++++----- src/python/tests/gff3/test_simplifier.py | 39 ++++++++++++++++++- .../gff3/test_simplifier/mirna/mirna1.gff | 6 +++ .../test_simplifier/mirna/mirna1_simped.gff | 8 ++++ 4 files changed, 69 insertions(+), 10 deletions(-) create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna1.gff create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna1_simped.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index f8a436f6d..0b9eec47d 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -123,7 +123,7 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: cleaned_features = [] for feature in record.features: split_genes = self.normalize_mirna(feature) - if split_genes: + if len(split_genes) > 1: cleaned_features += split_genes else: clean_feature = self.simpler_gff3_feature(feature) @@ -480,16 +480,24 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: GFFParserError: If gene has more than 1 transcript, the transcript was not formatted correctly or there are unknown sub-features. """ - - transcript = gene.sub_features - if (len(transcript) == 0) or (transcript[0].type != "primary_transcript"): - return [] - if len(transcript) > 1: - raise GFFParserError(f"Gene has too many sub_features for miRNA {gene.id}") + base_id = gene.id + if gene.type == "primary_transcript": + primary = gene + gene = SeqFeature(primary.location, type="gene") + gene.sub_features = [primary] + gene.qualifiers = primary.qualifiers + gene.id = f"{base_id}_0" + gene.qualifiers["ID"] = gene.id + else: + transcripts = gene.sub_features + if (len(transcripts) == 0) or (transcripts[0].type != "primary_transcript"): + return [gene] + if len(transcripts) > 1: + raise GFFParserError(f"Gene has too many sub_features for miRNA {gene.id}") + primary = transcripts[0] logging.debug(f"Formatting miRNA gene {gene.id}") - primary = transcript[0] new_genes = [] new_primary_subfeatures = [] num = 1 @@ -497,7 +505,7 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: if sub.type == "exon": new_primary_subfeatures.append(sub) elif sub.type == "miRNA": - new_gene_id = f"{gene.id}_{num}" + new_gene_id = f"{base_id}_{num}" num += 1 new_gene = SeqFeature(sub.location, "gene", id=new_gene_id) new_gene.qualifiers = {"source": sub.qualifiers["source"], "ID": new_gene_id} diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 27d1d7367..8b7288396 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -39,7 +39,10 @@ def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: if new_feat is not None: # Put it back - simp.records[0].features = [new_feat] + if isinstance(new_feat, list): + simp.records[0].features = new_feat + else: + simp.records[0].features = [new_feat] simp.records.to_gff(output_gff) return new_feat @@ -358,3 +361,37 @@ def test_gffsimplifier_with_genome( simp.simpler_gff3(input_gff) simp.records.to_gff(output_gff) assert_files(output_gff, data_dir / expected_gff) + + + +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param( + "ok_gene.gff", + "ok_gene.gff", + does_not_raise(), + id="Gene without miRNA", + ), + param( + "mirna/mirna1.gff", + "mirna/mirna1_simped.gff", + does_not_raise(), + id="Gene with 1 miRNA", + ), + ], +) +def test_normalize_mirna( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + expectation: ContextManager, +) -> None: + """Test normalizing miRNA genes.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / Path(in_gff).name + with expectation: + check_one_feature(input_gff, output_gff, "normalize_mirna") + assert_files(output_gff, Path(data_dir / expected_gff)) diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna1.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna1.gff new file mode 100644 index 000000000..b4bb721c6 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna1.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1M;Parent=LOREMIPSUM1 +scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna1_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna1_simped.gff new file mode 100644 index 000000000..487283c45 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna1_simped.gff @@ -0,0 +1,8 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_0 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_0_t1;Parent=LOREMIPSUM1_0 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_0_t1-E1;Parent=LOREMIPSUM1_0_t1 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_1 +scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1_1_t1;Parent=LOREMIPSUM1_1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_1_t1-E1;Parent=LOREMIPSUM1_1_t1 From c6043719bcb5e52838eadbe5fc604070d0300472 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 26 Mar 2024 14:57:43 +0000 Subject: [PATCH 49/72] More cases, tweak IDs --- src/python/ensembl/io/genomio/gff3/simplifier.py | 15 ++++++++++----- src/python/tests/gff3/test_simplifier.py | 14 +++++++++++++- .../gff3/test_simplifier/mirna/mirna2_pseudo.gff | 4 ++++ .../mirna/mirna2_pseudo_simped.gff | 5 +++++ .../gff3/test_simplifier/mirna/mirna3_gene.gff | 7 +++++++ .../test_simplifier/mirna/mirna3_gene_simped.gff | 8 ++++++++ 6 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo.gff create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo_simped.gff create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna3_gene.gff create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna3_gene_simped.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 0b9eec47d..c1d94a7b4 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -481,13 +481,13 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: correctly or there are unknown sub-features. """ base_id = gene.id + + # Insert main gene first if gene.type == "primary_transcript": primary = gene gene = SeqFeature(primary.location, type="gene") gene.sub_features = [primary] gene.qualifiers = primary.qualifiers - gene.id = f"{base_id}_0" - gene.qualifiers["ID"] = gene.id else: transcripts = gene.sub_features if (len(transcripts) == 0) or (transcripts[0].type != "primary_transcript"): @@ -496,6 +496,10 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: raise GFFParserError(f"Gene has too many sub_features for miRNA {gene.id}") primary = transcripts[0] + # Set ID of the top gene + gene.id = f"{base_id}_0" + gene.qualifiers["ID"] = gene.id + logging.debug(f"Formatting miRNA gene {gene.id}") new_genes = [] @@ -516,9 +520,10 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: primary.sub_features = new_primary_subfeatures if not new_genes: - raise GFFParserError(f"Could not parse a primary_transcript for {gene.id}") - - all_genes = [gene] + new_genes + logging.debug(f"Primary_transcript without miRNA in {gene.id}") + all_genes = [gene] + else: + all_genes = [gene] + new_genes # Normalize like other genes all_genes_cleaned = [] diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 8b7288396..1f8063480 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -377,7 +377,19 @@ def test_gffsimplifier_with_genome( "mirna/mirna1.gff", "mirna/mirna1_simped.gff", does_not_raise(), - id="Gene with 1 miRNA", + id="Primary_transcript with 1 miRNA", + ), + param( + "mirna/mirna2_pseudo.gff", + "mirna/mirna2_pseudo_simped.gff", + does_not_raise(), + id="Pseudo primary_transcript", + ), + param( + "mirna/mirna3_gene.gff", + "mirna/mirna3_gene_simped.gff", + does_not_raise(), + id="Gene with Primary_transcript with 1 miRNA", ), ], ) diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo.gff new file mode 100644 index 000000000..5a4f0c998 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1M;Parent=LOREMIPSUM1 \ No newline at end of file diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo_simped.gff new file mode 100644 index 000000000..502aa673c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo_simped.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_0 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_0_t1;Parent=LOREMIPSUM1_0 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_0_t1-E1;Parent=LOREMIPSUM1_0_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene.gff new file mode 100644 index 000000000..a447b2c7f --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene.gff @@ -0,0 +1,7 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1p;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1p_t1-E1M;Parent=LOREMIPSUM1p +scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1p_t1;Parent=LOREMIPSUM1p +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1p_t1-E1;Parent=LOREMIPSUM1p_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene_simped.gff new file mode 100644 index 000000000..487283c45 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene_simped.gff @@ -0,0 +1,8 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_0 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_0_t1;Parent=LOREMIPSUM1_0 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_0_t1-E1;Parent=LOREMIPSUM1_0_t1 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_1 +scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1_1_t1;Parent=LOREMIPSUM1_1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_1_t1-E1;Parent=LOREMIPSUM1_1_t1 From e99cdf7643ad6eb8d519070e292951e2d13a30a3 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 26 Mar 2024 16:20:57 +0000 Subject: [PATCH 50/72] More checks, improve code --- .../ensembl/io/genomio/gff3/simplifier.py | 20 +++++++++++-------- src/python/tests/gff3/test_simplifier.py | 12 +++++++++++ .../mirna/mirna4_unsupported.gff | 6 ++++++ .../mirna/mirna5_unsupported.gff | 5 +++++ 4 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna4_unsupported.gff create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna5_unsupported.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index c1d94a7b4..f8f59aef7 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -481,20 +481,24 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: correctly or there are unknown sub-features. """ base_id = gene.id + transcripts = gene.sub_features - # Insert main gene first + # Insert main gene first if needed + old_gene = gene if gene.type == "primary_transcript": - primary = gene + primary = old_gene gene = SeqFeature(primary.location, type="gene") gene.sub_features = [primary] gene.qualifiers = primary.qualifiers - else: transcripts = gene.sub_features - if (len(transcripts) == 0) or (transcripts[0].type != "primary_transcript"): - return [gene] - if len(transcripts) > 1: - raise GFFParserError(f"Gene has too many sub_features for miRNA {gene.id}") - primary = transcripts[0] + + if (len(transcripts) == 0) or (transcripts[0].type != "primary_transcript"): + return [old_gene] + if len(transcripts) > 1: + raise GFFParserError(f"Gene has too many sub_features for miRNA {gene.id}") + + # Passed the checks + primary = transcripts[0] # Set ID of the top gene gene.id = f"{base_id}_0" diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 1f8063480..69b036766 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -391,6 +391,18 @@ def test_gffsimplifier_with_genome( does_not_raise(), id="Gene with Primary_transcript with 1 miRNA", ), + param( + "mirna/mirna4_unsupported.gff", + "", + raises(GFFParserError, match="Unknown subtype"), + id="Gene with Primary_transcript with mRNA, not supported", + ), + param( + "mirna/mirna5_unsupported.gff", + "", + raises(GFFParserError, match="too many sub_features"), + id="Gene with Primary_transcript with 2 miRNAs, not supported", + ), ], ) def test_normalize_mirna( diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna4_unsupported.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna4_unsupported.gff new file mode 100644 index 000000000..684525919 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna4_unsupported.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1M;Parent=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna5_unsupported.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna5_unsupported.gff new file mode 100644 index 000000000..63dd90143 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna5_unsupported.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_t2;Parent=LOREMIPSUM1 From 617f77e7811ea29ebf7df265e7c7ea01423a9ef5 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 26 Mar 2024 16:28:40 +0000 Subject: [PATCH 51/72] Remove obsolete methods --- .../ensembl/io/genomio/gff3/simplifier.py | 47 ------------------- 1 file changed, 47 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index f8a436f6d..39f9329b4 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -422,53 +422,6 @@ def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFea transcript.sub_features.pop(elt) return transcript - # COMPLETION - def transcript_gene(self, ncrna: SeqFeature) -> SeqFeature: - """Create a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA' for all others - - Args: - ncrna: the transcript for which we want to create a gene. - - Returns: - The gene that contains the transcript. - - """ - new_type = "ncRNA_gene" - if ncrna.type in ("tRNA", "rRNA"): - new_type = "gene" - logging.debug(f"Put the transcript {ncrna.type} in a {new_type} parent feature") - gene = SeqFeature(ncrna.location, type=new_type) - gene.qualifiers["source"] = ncrna.qualifiers["source"] - gene.sub_features = [ncrna] - gene.id = ncrna.id - - return gene - - def cds_gene(self, cds: SeqFeature) -> SeqFeature: - """Returns a gene created for a lone CDS.""" - - logging.debug(f"Put the lone CDS in gene-mRNA parent features for {cds.id}") - - # Create a transcript, add the CDS - transcript = SeqFeature(cds.location, type="mRNA") - transcript.qualifiers["source"] = cds.qualifiers["source"] - transcript.sub_features = [cds] - - # Add an exon too - exon = SeqFeature(cds.location, type="exon") - exon.qualifiers["source"] = cds.qualifiers["source"] - transcript.sub_features.append(exon) - - # Create a gene, add the transcript - gene_type = "gene" - if ("pseudo" in cds.qualifiers) and (cds.qualifiers["pseudo"][0] == "true"): - gene_type = "pseudogene" - gene = SeqFeature(cds.location, type=gene_type) - gene.qualifiers["source"] = cds.qualifiers["source"] - gene.sub_features = [transcript] - gene.id = self.stable_ids.generate_gene_id() - - return gene def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: """Returns gene representations from a miRNA gene that can be loaded in an Ensembl database. From 42f1eddd1857837305b5306f719ee200225d1542 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 26 Mar 2024 16:32:06 +0000 Subject: [PATCH 52/72] format --- src/python/ensembl/io/genomio/gff3/simplifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 39f9329b4..1d2335c96 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -422,7 +422,6 @@ def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFea transcript.sub_features.pop(elt) return transcript - def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: """Returns gene representations from a miRNA gene that can be loaded in an Ensembl database. From 245d91ff20a2281857b98b1f1d999c3f4b8371d4 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 26 Mar 2024 16:49:42 +0000 Subject: [PATCH 53/72] More tests --- src/python/ensembl/io/genomio/gff3/simplifier.py | 5 ++--- src/python/tests/gff3/test_simplifier.py | 2 +- src/python/tests/gff3/test_simplifier/ok_tr_ignored.gff | 8 ++++++++ 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 src/python/tests/gff3/test_simplifier/ok_tr_ignored.gff diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 41ce829d7..a69b74dac 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -387,7 +387,6 @@ def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature: def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFeature) -> SeqFeature: """Returns a transcript with normalized sub-features.""" - ignored_transcript_types = self._biotypes["transcript"]["ignored"] exons_to_delete = [] exon_number = 1 for tcount, feat in enumerate(transcript.sub_features): @@ -405,7 +404,7 @@ def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFea if feat.id in ("", gene.id, transcript.id): feat.id = f"{transcript.id}_cds" else: - if feat.type in ignored_transcript_types: + if feat.type in self._biotypes["transcript"]["ignored"]: exons_to_delete.append(tcount) continue @@ -448,7 +447,7 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: return [old_gene] if len(transcripts) > 1: raise GFFParserError(f"Gene has too many sub_features for miRNA {gene.id}") - + # Passed the checks primary = transcripts[0] diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 69b036766..f2770254a 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -239,6 +239,7 @@ def test_clean_gene( param("gene_ignored.gff", None, id="gene ignored"), param("mobile_te.gff", "mobile_te.gff", id="TE"), param("ok_protein_coding_gene.gff", "ok_gene.gff", id="ok protein_coding_gene"), + param("ok_tr_ignored.gff", "ok_gene.gff", id="ok gene with ignored transcripts/subtranscripts"), ], ) def test_simpler_gff3_feature( @@ -363,7 +364,6 @@ def test_gffsimplifier_with_genome( assert_files(output_gff, data_dir / expected_gff) - @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ diff --git a/src/python/tests/gff3/test_simplifier/ok_tr_ignored.gff b/src/python/tests/gff3/test_simplifier/ok_tr_ignored.gff new file mode 100644 index 000000000..916081424 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/ok_tr_ignored.gff @@ -0,0 +1,8 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source five_prime_UTR 900 1000 . - 0 ID=5pID;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 +scaffold1 Source three_prime_UTR 900 1000 . - 0 ID=3pID;Parent=LOREMIPSUM1_t1 From cb5221b67fd0f8a6ecd70d5dae05bc44d5a9620b Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Tue, 26 Mar 2024 16:53:37 +0000 Subject: [PATCH 54/72] Test miRNA split support --- src/python/tests/gff3/test_simplifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index f2770254a..254c7acd2 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -266,6 +266,7 @@ def test_simpler_gff3_feature( param("bad_gene_type.gff", "", raises(GFFParserError), id="Unsupported gene type"), param("bad_tr_type.gff", "", raises(GFFParserError), id="Unsupported transcript type"), param("bad_subtr_type.gff", "", raises(GFFParserError), id="Unsupported subtranscript type"), + param("mirna/mirna1.gff", "mirna/mirna1_simped.gff", does_not_raise(), id="miRNA split"), ], ) def test_simpler_gff3( @@ -278,7 +279,7 @@ def test_simpler_gff3( ) -> None: """Test simplifying genes from GFF3 files.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / in_gff + output_gff = tmp_dir / Path(in_gff).name with expectation: simp = GFFSimplifier() simp.simpler_gff3(input_gff) From 36022b294a992a0ea4e2606c5ac97fc22cb2c5cc Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 27 Mar 2024 11:41:54 +0000 Subject: [PATCH 55/72] Add pseudogene check --- src/python/tests/gff3/test_simplifier.py | 28 +++++++++++++++++++ .../tests/gff3/test_simplifier/pseudogene.gff | 5 ++++ .../gff3/test_simplifier/pseudogene_cds.gff | 6 ++++ .../pseudogene_cds_removed.gff | 5 ++++ 4 files changed, 44 insertions(+) create mode 100644 src/python/tests/gff3/test_simplifier/pseudogene.gff create mode 100644 src/python/tests/gff3/test_simplifier/pseudogene_cds.gff create mode 100644 src/python/tests/gff3/test_simplifier/pseudogene_cds_removed.gff diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 254c7acd2..58c32b407 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -287,6 +287,34 @@ def test_simpler_gff3( assert_files(output_gff, data_dir / expected_gff) +@pytest.mark.parametrize( + "in_gff, expected_gff, allow_cds", + [ + param("ok_gene.gff", "ok_gene.gff", False, id="ok gene"), + param("ok_gene.gff", "ok_gene.gff", True, id="ok gene, allow pseudo CDS"), + param("pseudogene.gff", "pseudogene.gff", False, id="ok pseudogene"), + param("pseudogene_cds.gff", "pseudogene_cds_removed.gff", False, id="pseudogene cds removed"), + param("pseudogene_cds.gff", "pseudogene_cds.gff", True, id="pseudogene cds kept"), + ], +) +def test_simpler_gff3_pseudogene( + data_dir: Path, + tmp_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + allow_cds: bool, +) -> None: + """Test simplifying pseudogenes from GFF3 files.""" + input_gff = data_dir / in_gff + output_gff = tmp_dir / Path(in_gff).name + simp = GFFSimplifier() + simp.allow_pseudogene_with_cds = allow_cds + simp.simpler_gff3(input_gff) + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) + + @pytest.mark.parametrize( "in_gff, expected_gff, skip_unrecognized, expectation", [ diff --git a/src/python/tests/gff3/test_simplifier/pseudogene.gff b/src/python/tests/gff3/test_simplifier/pseudogene.gff new file mode 100644 index 000000000..a6d419cb0 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/pseudogene.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source pseudogene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/pseudogene_cds.gff b/src/python/tests/gff3/test_simplifier/pseudogene_cds.gff new file mode 100644 index 000000000..48d5ba95c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/pseudogene_cds.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source pseudogene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/pseudogene_cds_removed.gff b/src/python/tests/gff3/test_simplifier/pseudogene_cds_removed.gff new file mode 100644 index 000000000..a6d419cb0 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/pseudogene_cds_removed.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source pseudogene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 From 412f59b73b714363cd79171194bfd7a9c0b75137 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 27 Mar 2024 14:23:10 +0000 Subject: [PATCH 56/72] Add pseudogene miRNA --- src/python/tests/gff3/test_simplifier.py | 6 ++++++ .../tests/gff3/test_simplifier/mirna/mirna6_pseudogene.gff | 5 +++++ .../gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff | 5 +++++ 3 files changed, 16 insertions(+) create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene.gff create mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 58c32b407..95f746f87 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -432,6 +432,12 @@ def test_gffsimplifier_with_genome( raises(GFFParserError, match="too many sub_features"), id="Gene with Primary_transcript with 2 miRNAs, not supported", ), + param( + "mirna/mirna6_pseudogene.gff", + "mirna/mirna6_pseudogene_simped.gff", + does_not_raise(), + id="Gene with Primary_transcript and no miRNA (pseudogene)", + ), ], ) def test_normalize_mirna( diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene.gff new file mode 100644 index 000000000..76e01bb16 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1p;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1p_t1-E1M;Parent=LOREMIPSUM1p diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff new file mode 100644 index 000000000..502aa673c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_0 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_0_t1;Parent=LOREMIPSUM1_0 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_0_t1-E1;Parent=LOREMIPSUM1_0_t1 From 230de4efac4bd45118e9e8e92fcb1239fc4f93c4 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 27 Mar 2024 15:05:48 +0000 Subject: [PATCH 57/72] Reorganize miRNA test files --- .../ensembl/io/genomio/gff3/simplifier.py | 6 +-- src/python/tests/gff3/test_simplifier.py | 42 +++++++++---------- .../mirna/{mirna3_gene.gff => gene.gff} | 0 ...mirna3_gene_simped.gff => gene_simped.gff} | 6 +-- .../mirna/mirna6_pseudogene_simped.gff | 5 --- .../mirna/{mirna1.gff => nogene.gff} | 0 .../{mirna1_simped.gff => nogene_simped.gff} | 0 .../{mirna2_pseudo.gff => pseudo_nogene.gff} | 0 ...do_simped.gff => pseudo_nogene_simped.gff} | 0 .../{mirna6_pseudogene.gff => pseudogene.gff} | 0 .../mirna/pseudogene_simped.gff | 5 +++ ...mirna5_unsupported.gff => two_primary.gff} | 0 ...na4_unsupported.gff => unsupported_tr.gff} | 0 13 files changed, 31 insertions(+), 33 deletions(-) rename src/python/tests/gff3/test_simplifier/mirna/{mirna3_gene.gff => gene.gff} (100%) rename src/python/tests/gff3/test_simplifier/mirna/{mirna3_gene_simped.gff => gene_simped.gff} (67%) delete mode 100644 src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff rename src/python/tests/gff3/test_simplifier/mirna/{mirna1.gff => nogene.gff} (100%) rename src/python/tests/gff3/test_simplifier/mirna/{mirna1_simped.gff => nogene_simped.gff} (100%) rename src/python/tests/gff3/test_simplifier/mirna/{mirna2_pseudo.gff => pseudo_nogene.gff} (100%) rename src/python/tests/gff3/test_simplifier/mirna/{mirna2_pseudo_simped.gff => pseudo_nogene_simped.gff} (100%) rename src/python/tests/gff3/test_simplifier/mirna/{mirna6_pseudogene.gff => pseudogene.gff} (100%) create mode 100644 src/python/tests/gff3/test_simplifier/mirna/pseudogene_simped.gff rename src/python/tests/gff3/test_simplifier/mirna/{mirna5_unsupported.gff => two_primary.gff} (100%) rename src/python/tests/gff3/test_simplifier/mirna/{mirna4_unsupported.gff => unsupported_tr.gff} (100%) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index a69b74dac..4ea659c50 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -442,6 +442,8 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: gene.sub_features = [primary] gene.qualifiers = primary.qualifiers transcripts = gene.sub_features + gene.id = f"{base_id}_0" + gene.qualifiers["ID"] = gene.id if (len(transcripts) == 0) or (transcripts[0].type != "primary_transcript"): return [old_gene] @@ -451,10 +453,6 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: # Passed the checks primary = transcripts[0] - # Set ID of the top gene - gene.id = f"{base_id}_0" - gene.qualifiers["ID"] = gene.id - logging.debug(f"Formatting miRNA gene {gene.id}") new_genes = [] diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 95f746f87..7617e3d5c 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -266,7 +266,7 @@ def test_simpler_gff3_feature( param("bad_gene_type.gff", "", raises(GFFParserError), id="Unsupported gene type"), param("bad_tr_type.gff", "", raises(GFFParserError), id="Unsupported transcript type"), param("bad_subtr_type.gff", "", raises(GFFParserError), id="Unsupported subtranscript type"), - param("mirna/mirna1.gff", "mirna/mirna1_simped.gff", does_not_raise(), id="miRNA split"), + param("mirna/gene.gff", "mirna/gene_simped.gff", does_not_raise(), id="miRNA split"), ], ) def test_simpler_gff3( @@ -400,43 +400,43 @@ def test_gffsimplifier_with_genome( "ok_gene.gff", "ok_gene.gff", does_not_raise(), - id="Gene without miRNA", + id="normal gene", ), param( - "mirna/mirna1.gff", - "mirna/mirna1_simped.gff", + "mirna/gene.gff", + "mirna/gene_simped.gff", does_not_raise(), - id="Primary_transcript with 1 miRNA", + id="gene + primary_transcript + miRNA", ), param( - "mirna/mirna2_pseudo.gff", - "mirna/mirna2_pseudo_simped.gff", + "mirna/pseudogene.gff", + "mirna/pseudogene_simped.gff", does_not_raise(), - id="Pseudo primary_transcript", + id="gene + primary_transcript - miRNA", ), param( - "mirna/mirna3_gene.gff", - "mirna/mirna3_gene_simped.gff", + "mirna/nogene.gff", + "mirna/nogene_simped.gff", does_not_raise(), - id="Gene with Primary_transcript with 1 miRNA", + id="primary_transcript + miRNA", ), param( - "mirna/mirna4_unsupported.gff", + "mirna/pseudo_nogene.gff", + "mirna/pseudo_nogene_simped.gff", + does_not_raise(), + id="primary_transcript - miRNA", + ), + param( + "mirna/unsupported_tr.gff", "", raises(GFFParserError, match="Unknown subtype"), - id="Gene with Primary_transcript with mRNA, not supported", + id="gene + primary_transcript + mRNA, not supported", ), param( - "mirna/mirna5_unsupported.gff", + "mirna/two_primary.gff", "", raises(GFFParserError, match="too many sub_features"), - id="Gene with Primary_transcript with 2 miRNAs, not supported", - ), - param( - "mirna/mirna6_pseudogene.gff", - "mirna/mirna6_pseudogene_simped.gff", - does_not_raise(), - id="Gene with Primary_transcript and no miRNA (pseudogene)", + id="gene + 2x primary_transcript, not supported", ), ], ) diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene.gff b/src/python/tests/gff3/test_simplifier/mirna/gene.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/mirna/mirna3_gene.gff rename to src/python/tests/gff3/test_simplifier/mirna/gene.gff diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/gene_simped.gff similarity index 67% rename from src/python/tests/gff3/test_simplifier/mirna/mirna3_gene_simped.gff rename to src/python/tests/gff3/test_simplifier/mirna/gene_simped.gff index 487283c45..b45083490 100644 --- a/src/python/tests/gff3/test_simplifier/mirna/mirna3_gene_simped.gff +++ b/src/python/tests/gff3/test_simplifier/mirna/gene_simped.gff @@ -1,8 +1,8 @@ ##gff-version 3 ##sequence-region scaffold1 1 1000 -scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_0 -scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_0_t1;Parent=LOREMIPSUM1_0 -scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_0_t1-E1;Parent=LOREMIPSUM1_0_t1 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_1 scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1_1_t1;Parent=LOREMIPSUM1_1 scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_1_t1-E1;Parent=LOREMIPSUM1_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff deleted file mode 100644 index 502aa673c..000000000 --- a/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene_simped.gff +++ /dev/null @@ -1,5 +0,0 @@ -##gff-version 3 -##sequence-region scaffold1 1 1000 -scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_0 -scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_0_t1;Parent=LOREMIPSUM1_0 -scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_0_t1-E1;Parent=LOREMIPSUM1_0_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna1.gff b/src/python/tests/gff3/test_simplifier/mirna/nogene.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/mirna/mirna1.gff rename to src/python/tests/gff3/test_simplifier/mirna/nogene.gff diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna1_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/nogene_simped.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/mirna/mirna1_simped.gff rename to src/python/tests/gff3/test_simplifier/mirna/nogene_simped.gff diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo.gff b/src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo.gff rename to src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene.gff diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene_simped.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/mirna/mirna2_pseudo_simped.gff rename to src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene_simped.gff diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene.gff b/src/python/tests/gff3/test_simplifier/mirna/pseudogene.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/mirna/mirna6_pseudogene.gff rename to src/python/tests/gff3/test_simplifier/mirna/pseudogene.gff diff --git a/src/python/tests/gff3/test_simplifier/mirna/pseudogene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/pseudogene_simped.gff new file mode 100644 index 000000000..a552d5ca6 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/pseudogene_simped.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna5_unsupported.gff b/src/python/tests/gff3/test_simplifier/mirna/two_primary.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/mirna/mirna5_unsupported.gff rename to src/python/tests/gff3/test_simplifier/mirna/two_primary.gff diff --git a/src/python/tests/gff3/test_simplifier/mirna/mirna4_unsupported.gff b/src/python/tests/gff3/test_simplifier/mirna/unsupported_tr.gff similarity index 100% rename from src/python/tests/gff3/test_simplifier/mirna/mirna4_unsupported.gff rename to src/python/tests/gff3/test_simplifier/mirna/unsupported_tr.gff From 9803dd111e282799ac74b96c34c72087837c76f4 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 27 Mar 2024 15:19:25 +0000 Subject: [PATCH 58/72] Fix: no change, don't return modified gene --- src/python/ensembl/io/genomio/gff3/simplifier.py | 4 ++-- src/python/tests/gff3/test_simplifier.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 4ea659c50..1e7fc5e24 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -123,7 +123,7 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: cleaned_features = [] for feature in record.features: split_genes = self.normalize_mirna(feature) - if len(split_genes) > 1: + if split_genes: cleaned_features += split_genes else: clean_feature = self.simpler_gff3_feature(feature) @@ -446,7 +446,7 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: gene.qualifiers["ID"] = gene.id if (len(transcripts) == 0) or (transcripts[0].type != "primary_transcript"): - return [old_gene] + return [] if len(transcripts) > 1: raise GFFParserError(f"Gene has too many sub_features for miRNA {gene.id}") diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 7617e3d5c..50748dff4 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -266,7 +266,6 @@ def test_simpler_gff3_feature( param("bad_gene_type.gff", "", raises(GFFParserError), id="Unsupported gene type"), param("bad_tr_type.gff", "", raises(GFFParserError), id="Unsupported transcript type"), param("bad_subtr_type.gff", "", raises(GFFParserError), id="Unsupported subtranscript type"), - param("mirna/gene.gff", "mirna/gene_simped.gff", does_not_raise(), id="miRNA split"), ], ) def test_simpler_gff3( @@ -440,7 +439,7 @@ def test_gffsimplifier_with_genome( ), ], ) -def test_normalize_mirna( +def test_simpler_gff3_mirna( data_dir: Path, tmp_dir: Path, assert_files: Callable, @@ -452,5 +451,7 @@ def test_normalize_mirna( input_gff = data_dir / in_gff output_gff = tmp_dir / Path(in_gff).name with expectation: - check_one_feature(input_gff, output_gff, "normalize_mirna") - assert_files(output_gff, Path(data_dir / expected_gff)) + simp = GFFSimplifier() + simp.simpler_gff3(input_gff) + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) From 4a5f25126b1ae681b7bb55ab7db1c53fa06bb3c7 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 27 Mar 2024 15:49:39 +0000 Subject: [PATCH 59/72] Bugfix: properly store annotation for translations --- src/python/ensembl/io/genomio/gff3/extract_annotation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/gff3/extract_annotation.py b/src/python/ensembl/io/genomio/gff3/extract_annotation.py index 53bde3573..3979ca536 100644 --- a/src/python/ensembl/io/genomio/gff3/extract_annotation.py +++ b/src/python/ensembl/io/genomio/gff3/extract_annotation.py @@ -298,9 +298,9 @@ def store_gene(self, gene: SeqFeature) -> None: """Record the functional_annotations of a gene and its children features.""" self.add_feature(gene, "gene") - cds_found = False for transcript in gene.sub_features: self.add_feature(transcript, "transcript", gene.id) + cds_found = False for feat in transcript.sub_features: if feat.type != "CDS": continue From 61ffb3fcb4b900afd301eb7bee8cc6041ba92c6c Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Wed, 27 Mar 2024 16:08:01 +0000 Subject: [PATCH 60/72] Add test for fix --- .../tests/gff3/test_extract_annotation.py | 52 +++++++++++-------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/src/python/tests/gff3/test_extract_annotation.py b/src/python/tests/gff3/test_extract_annotation.py index 3f7ac1439..cdcc57f65 100644 --- a/src/python/tests/gff3/test_extract_annotation.py +++ b/src/python/tests/gff3/test_extract_annotation.py @@ -328,18 +328,23 @@ def test_transfer_descriptions( @pytest.mark.dependency(depends=["add_feature"]) @pytest.mark.parametrize( - "cds_parts, num_genes, num_tr, num_cds", + "num_cds, cds_parts, expected_num_genes, expected_num_tr, expected_num_cds", [ - pytest.param(0, 1, 1, 0, id="Store gene without CDS"), - pytest.param(1, 1, 1, 1, id="Store gene with CDS in one part"), - pytest.param(2, 1, 1, 1, id="Store gene with CDS in 2 parts"), + pytest.param(0, 0, 1, 1, 0, id="Store gene without CDS"), + pytest.param(1, 1, 1, 1, 1, id="Store gene with 1 CDS in one part"), + pytest.param(1, 2, 1, 1, 1, id="Store gene with 1 CDS in 2 parts"), + pytest.param(2, 1, 1, 2, 2, id="Store gene with 2 CDS in 1 part each"), + pytest.param(2, 2, 1, 2, 2, id="Store gene with 2 CDS in 2 part each"), ], ) -def test_store_gene(cds_parts: int, num_genes: int, num_tr: int, num_cds: int) -> None: +def test_store_gene( + cds_parts: int, num_cds: int, expected_num_genes: int, expected_num_tr: int, expected_num_cds: int +) -> None: """Test store_gene given a gene Feature with a transcript and optional translation. Args: - cds_parts: Number of parts of the one CDS (0 means no CDS) + cds_parts: Number of parts of each CDS + num_cds: Number of CDSs stored num_genes: Number of genes stored as expected num_tr: Number of transcripts stored as expected num_cds: Number of CDSs stored as expected @@ -349,22 +354,27 @@ def test_store_gene(cds_parts: int, num_genes: int, num_tr: int, num_cds: int) - transcript_name = "tran_A" one_gene = SeqFeature(type="gene", id=gene_name) one_gene.sub_features = [] - one_transcript = SeqFeature(type="mRNA", id=transcript_name) - one_transcript.sub_features = [] - - # Add one exon - one_exon = SeqFeature(type="exon", id="exon_A") - one_transcript.sub_features.append(one_exon) # Add a translation (possibly in parts) - if cds_parts > 0: - for _ in range(1, cds_parts + 1): - one_translation = SeqFeature(type="CDS", id="cds_A") - one_transcript.sub_features.append(one_translation) - - one_gene.sub_features.append(one_transcript) + if num_cds: + for cds_number in range(1, num_cds + 1): + transcript = SeqFeature(type="mRNA", id=f"tran_{cds_number}") + transcript.sub_features = [] + exon = SeqFeature(type="exon", id=f"exon_{cds_number}") + transcript.sub_features.append(exon) + if cds_parts > 0: + for _ in range(1, cds_parts + 1): + translation = SeqFeature(type="CDS", id=f"cds_{cds_number}") + transcript.sub_features.append(translation) + one_gene.sub_features.append(transcript) + else: + one_transcript = SeqFeature(type="mRNA", id=transcript_name) + one_transcript.sub_features = [] + one_exon = SeqFeature(type="exon", id="exon_A") + one_transcript.sub_features.append(one_exon) + one_gene.sub_features.append(one_transcript) annot.store_gene(one_gene) - assert len(annot.features["gene"]) == num_genes - assert len(annot.features["transcript"]) == num_tr - assert len(annot.features["translation"]) == num_cds + assert len(annot.features["gene"]) == expected_num_genes + assert len(annot.features["transcript"]) == expected_num_tr + assert len(annot.features["translation"]) == expected_num_cds From 40ea4e6d5e128839016d51f87a435af41e056dae Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 10 Apr 2024 15:42:57 +0100 Subject: [PATCH 61/72] simplify code to avoid using flags --- src/python/ensembl/io/genomio/gff3/extract_annotation.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/extract_annotation.py b/src/python/ensembl/io/genomio/gff3/extract_annotation.py index 3979ca536..98b56b45c 100644 --- a/src/python/ensembl/io/genomio/gff3/extract_annotation.py +++ b/src/python/ensembl/io/genomio/gff3/extract_annotation.py @@ -300,11 +300,8 @@ def store_gene(self, gene: SeqFeature) -> None: for transcript in gene.sub_features: self.add_feature(transcript, "transcript", gene.id) - cds_found = False for feat in transcript.sub_features: - if feat.type != "CDS": - continue - # Store CDS functional annotation only once - if not cds_found: - cds_found = True + if feat.type == "CDS": self.add_feature(feat, "translation", transcript.id) + # Store CDS functional annotation only once + break From c926678480c3e244658aa22cbcbcda7a8b944ab1 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 17:11:03 +0100 Subject: [PATCH 62/72] Restrict what is in the with section --- src/python/tests/gff3/test_simplifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 50748dff4..f948e2682 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -453,5 +453,6 @@ def test_simpler_gff3_mirna( with expectation: simp = GFFSimplifier() simp.simpler_gff3(input_gff) + if expected_gff: simp.records.to_gff(output_gff) assert_files(output_gff, data_dir / expected_gff) From 5469ba785baee3bc556eb5a5b923f0b74e650946 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 17:21:22 +0100 Subject: [PATCH 63/72] unify test_from_gff --- src/python/tests/gff3/test_records.py | 34 +++++++++---------- .../{invalid.gff3 => invalid.gff} | 0 2 files changed, 17 insertions(+), 17 deletions(-) rename src/python/tests/gff3/test_records/{invalid.gff3 => invalid.gff} (100%) diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py index 1b13f65a4..e0ed4718c 100644 --- a/src/python/tests/gff3/test_records.py +++ b/src/python/tests/gff3/test_records.py @@ -14,9 +14,10 @@ # limitations under the License. """Unit testing of `ensembl.io.genomio.gff3.simplifier.Records` class.""" +from contextlib import nullcontext as no_raise from os import PathLike from pathlib import Path -from typing import Callable, List, Optional +from typing import Callable, ContextManager, List, Optional import pytest from pytest import param, raises @@ -25,31 +26,30 @@ @pytest.mark.parametrize( - "in_gff, excluded, expected_loaded", + "in_gff, excluded, expected_loaded, expectation", [ - param("record_n2.gff", None, ["scaffold1", "scaffold2"], id="2 records"), - param("record_n2.gff", ["scaffold1"], ["scaffold2"], id="2 records, exclude 1"), - param("record_n1.gff", ["Lorem"], ["scaffold1"], id="1 record, exclude not in record"), + param("record_n2.gff", None, ["scaffold1", "scaffold2"], no_raise(), id="2 records"), + param("record_n2.gff", ["scaffold1"], ["scaffold2"], no_raise(), id="2 records, exclude 1"), + param("record_n1.gff", ["Lorem"], ["scaffold1"], no_raise(), id="1 record, exclude not in record"), + param("invalid.gff", None, [], raises(AssertionError), id="Invalid GFF3"), ], ) def test_from_gff( - data_dir: Path, in_gff: PathLike, excluded: Optional[List[str]], expected_loaded: List[str] + data_dir: Path, + in_gff: PathLike, + excluded: Optional[List[str]], + expected_loaded: List[str], + expectation: ContextManager, ) -> None: """Test loading GFF records from file.""" input_gff = data_dir / in_gff records = Records() - records.from_gff(input_gff, excluded) - record_names = [record.id for record in records] - assert record_names == expected_loaded - - -def test_from_gff_invalid(data_dir: Path) -> None: - """Test loading invalid GFF file.""" - input_gff = data_dir / "invalid.gff3" - records = Records() - with raises(AssertionError): - records.from_gff(input_gff) + with expectation: + records.from_gff(input_gff, excluded) + if expected_loaded: + record_names = [record.id for record in records] + assert record_names == expected_loaded @pytest.mark.parametrize( diff --git a/src/python/tests/gff3/test_records/invalid.gff3 b/src/python/tests/gff3/test_records/invalid.gff similarity index 100% rename from src/python/tests/gff3/test_records/invalid.gff3 rename to src/python/tests/gff3/test_records/invalid.gff From a6033efd326de57eef97117976c9ba64f2f3e833 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 17:27:46 +0100 Subject: [PATCH 64/72] Restrict with --- src/python/tests/gff3/test_simplifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index f948e2682..a394f8e5a 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -279,9 +279,10 @@ def test_simpler_gff3( """Test simplifying genes from GFF3 files.""" input_gff = data_dir / in_gff output_gff = tmp_dir / Path(in_gff).name + simp = GFFSimplifier() with expectation: - simp = GFFSimplifier() simp.simpler_gff3(input_gff) + if expected_gff: simp.records.to_gff(output_gff) assert_files(output_gff, data_dir / expected_gff) From 58cf4c3a4599d6264c265a994af14795ba52bc8b Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 17:30:59 +0100 Subject: [PATCH 65/72] simp creation outside with --- src/python/tests/gff3/test_simplifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index a394f8e5a..35a2abb06 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -451,8 +451,8 @@ def test_simpler_gff3_mirna( """Test normalizing miRNA genes.""" input_gff = data_dir / in_gff output_gff = tmp_dir / Path(in_gff).name + simp = GFFSimplifier() with expectation: - simp = GFFSimplifier() simp.simpler_gff3(input_gff) if expected_gff: simp.records.to_gff(output_gff) From aae62465ee2038fb0f8f4f15ed86d4cf7f798e9c Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 17:34:45 +0100 Subject: [PATCH 66/72] tmp_dir -> tmp_path --- src/python/tests/gff3/test_records.py | 4 +-- src/python/tests/gff3/test_simplifier.py | 32 ++++++++++++------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py index e0ed4718c..0f282fe47 100644 --- a/src/python/tests/gff3/test_records.py +++ b/src/python/tests/gff3/test_records.py @@ -59,10 +59,10 @@ def test_from_gff( param("record_n2.gff", id="2 records"), ], ) -def test_to_gff(tmp_dir: Path, data_dir: Path, assert_files: Callable, in_gff: PathLike) -> None: +def test_to_gff(tmp_path: Path, data_dir: Path, assert_files: Callable, in_gff: PathLike) -> None: """Test writing GFF records to file.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / in_gff + output_gff = tmp_path / in_gff records = Records() records.from_gff(input_gff) records.to_gff(output_gff) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 35a2abb06..642a4686a 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -81,14 +81,14 @@ def test_create_gene_for_lone_transcript( ) def test_create_gene_for_lone_cds( data_dir: Path, - tmp_dir: Path, + tmp_path: Path, assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, ) -> None: """Test gene create gene for lone CDS.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / Path(in_gff).name + output_gff = tmp_path / Path(in_gff).name new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_cds") if new_feat: assert_files(output_gff, Path(data_dir / expected_gff)) @@ -219,14 +219,14 @@ def test_format_gene_segments( ) def test_clean_gene( data_dir: Path, - tmp_dir: Path, + tmp_path: Path, assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, ) -> None: """Test clean gene.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / Path(in_gff).name + output_gff = tmp_path / Path(in_gff).name new_feat = check_one_feature(input_gff, output_gff, "clean_gene") if new_feat: assert_files(output_gff, Path(data_dir / expected_gff)) @@ -244,14 +244,14 @@ def test_clean_gene( ) def test_simpler_gff3_feature( data_dir: Path, - tmp_dir: Path, + tmp_path: Path, assert_files: Callable, in_gff: PathLike, expected_gff: Optional[PathLike], ) -> None: """Test simplifying one gene (from a GFF3 file).""" input_gff = data_dir / in_gff - output_gff = tmp_dir / in_gff + output_gff = tmp_path / in_gff new_feat = check_one_feature(input_gff, output_gff, "simpler_gff3_feature") if expected_gff is None: assert new_feat is None @@ -270,7 +270,7 @@ def test_simpler_gff3_feature( ) def test_simpler_gff3( data_dir: Path, - tmp_dir: Path, + tmp_path: Path, assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, @@ -278,7 +278,7 @@ def test_simpler_gff3( ) -> None: """Test simplifying genes from GFF3 files.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / Path(in_gff).name + output_gff = tmp_path / Path(in_gff).name simp = GFFSimplifier() with expectation: simp.simpler_gff3(input_gff) @@ -299,7 +299,7 @@ def test_simpler_gff3( ) def test_simpler_gff3_pseudogene( data_dir: Path, - tmp_dir: Path, + tmp_path: Path, assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, @@ -307,7 +307,7 @@ def test_simpler_gff3_pseudogene( ) -> None: """Test simplifying pseudogenes from GFF3 files.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / Path(in_gff).name + output_gff = tmp_path / Path(in_gff).name simp = GFFSimplifier() simp.allow_pseudogene_with_cds = allow_cds simp.simpler_gff3(input_gff) @@ -332,7 +332,7 @@ def test_simpler_gff3_pseudogene( ) def test_simpler_gff3_skip( data_dir: Path, - tmp_dir: Path, + tmp_path: Path, assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, @@ -341,7 +341,7 @@ def test_simpler_gff3_skip( ) -> None: """Test simplifying genes from GFF3 files.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / in_gff + output_gff = tmp_path / in_gff simp = GFFSimplifier(skip_unrecognized=skip_unrecognized) with expectation: simp.simpler_gff3(input_gff) @@ -375,7 +375,7 @@ def test_simpler_gff3_skip( ) def test_gffsimplifier_with_genome( data_dir: Path, - tmp_dir: Path, + tmp_path: Path, assert_files: Callable, genome_file: Optional[PathLike], in_gff: PathLike, @@ -383,7 +383,7 @@ def test_gffsimplifier_with_genome( ) -> None: """Test simplifying genes from GFF3 files.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / in_gff + output_gff = tmp_path / in_gff if genome_file is None: simp = GFFSimplifier() else: @@ -442,7 +442,7 @@ def test_gffsimplifier_with_genome( ) def test_simpler_gff3_mirna( data_dir: Path, - tmp_dir: Path, + tmp_path: Path, assert_files: Callable, in_gff: PathLike, expected_gff: PathLike, @@ -450,7 +450,7 @@ def test_simpler_gff3_mirna( ) -> None: """Test normalizing miRNA genes.""" input_gff = data_dir / in_gff - output_gff = tmp_dir / Path(in_gff).name + output_gff = tmp_path / Path(in_gff).name simp = GFFSimplifier() with expectation: simp.simpler_gff3(input_gff) From 3f8921103f542a7516a243bc652b2455ebde9bb3 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 17:39:31 +0100 Subject: [PATCH 67/72] Apply suggestions from code review Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/gff3/simplifier.py | 16 ++++++++++++---- src/python/tests/gff3/test_restructure.py | 5 +---- src/python/tests/gff3/test_simplifier.py | 9 ++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 1e7fc5e24..69665d7eb 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -56,14 +56,18 @@ def from_gff(self, in_gff_path: PathLike, excluded: Optional[List[str]] = None) with Path(in_gff_path).open("r") as in_gff_fh: for record in GFF.parse(in_gff_fh): if record.id in excluded: - logging.debug(f"Skip seq_region {record.id}") + logging.debug(f"Skip seq_region {record.id} - in exclusion list") continue clean_record = SeqRecord(record.seq, id=record.id) clean_record.features = record.features self.append(clean_record) def to_gff(self, out_gff_path: PathLike) -> None: - """Print out the current list of records in a GFF3 file.""" + """Writes the current list of records in a GFF3 file. + + Args: + out_gff_path: Path to GFF3 file where to write the records. + """ with Path(out_gff_path).open("w") as out_gff_fh: GFF.write(self, out_gff_fh) @@ -172,7 +176,7 @@ def create_gene_for_lone_transcript(self, feat: SeqFeature) -> SeqFeature: """Returns a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA_gene' for all others. Args: - ncrna: The transcript for which we want to create a gene. + feat: The transcript for which we want to create a gene. """ transcript_types = self._biotypes["transcript"]["supported"] if feat.type not in transcript_types: @@ -195,7 +199,11 @@ def create_gene_for_lone_transcript(self, feat: SeqFeature) -> SeqFeature: return new_gene def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: - """Returns a gene created for a lone CDS.""" + """Returns a gene created for a lone CDS. + + Args: + feat: The transcript for which we want to create a gene. +""" if feat.type != "CDS": return feat diff --git a/src/python/tests/gff3/test_restructure.py b/src/python/tests/gff3/test_restructure.py index 06ea0e3b9..5d25f949f 100644 --- a/src/python/tests/gff3/test_restructure.py +++ b/src/python/tests/gff3/test_restructure.py @@ -353,10 +353,7 @@ def test_restructure_gene( param({"pseudogene": ["CDS", "CDS"]}, "pseudogene", id="pseudogene CDSs"), ], ) -def test_remove_cds_from_pseudogene( - children: List[Any], - expected_children: List[Any], -) -> None: +def test_remove_cds_from_pseudogene(children: List[Any], expected_children: List[Any]) -> None: """Test CDS removal from pseudogene.""" gen = FeatGenerator() gene = gen.make_structure([children])[0] diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 642a4686a..56845496f 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Unit testing of `ensembl.io.genomio.gff3.restructure` module.""" +"""Unit testing of `ensembl.io.genomio.gff3.simplifier` module.""" from contextlib import nullcontext as does_not_raise from os import PathLike @@ -396,12 +396,7 @@ def test_gffsimplifier_with_genome( @pytest.mark.parametrize( "in_gff, expected_gff, expectation", [ - param( - "ok_gene.gff", - "ok_gene.gff", - does_not_raise(), - id="normal gene", - ), + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="normal gene"), param( "mirna/gene.gff", "mirna/gene_simped.gff", From d410b98ddaeae164af03945a147f268e3967635b Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 17:41:11 +0100 Subject: [PATCH 68/72] Typo --- src/python/ensembl/io/genomio/gff3/simplifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 69665d7eb..87ba9c954 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -202,8 +202,8 @@ def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: """Returns a gene created for a lone CDS. Args: - feat: The transcript for which we want to create a gene. -""" + feat: The CDS for which we want to create a gene. + """ if feat.type != "CDS": return feat From a948a904a5639d218994526054bbaf62f401dc15 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 17:43:12 +0100 Subject: [PATCH 69/72] Update doc --- src/python/tests/gff3/test_extract_annotation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/python/tests/gff3/test_extract_annotation.py b/src/python/tests/gff3/test_extract_annotation.py index cdcc57f65..81fc9879f 100644 --- a/src/python/tests/gff3/test_extract_annotation.py +++ b/src/python/tests/gff3/test_extract_annotation.py @@ -343,11 +343,11 @@ def test_store_gene( """Test store_gene given a gene Feature with a transcript and optional translation. Args: - cds_parts: Number of parts of each CDS num_cds: Number of CDSs stored - num_genes: Number of genes stored as expected - num_tr: Number of transcripts stored as expected - num_cds: Number of CDSs stored as expected + cds_parts: Number of parts of each CDS + expected_num_genes: Number of genes stored as expected + expected_num_tr: Number of transcripts stored as expected + expected_num_cds: Number of CDSs stored as expected """ annot = FunctionalAnnotations() gene_name = "gene_A" From a05f1de877a9800948e4efb97b3b734a9b0873c8 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 18:25:00 +0100 Subject: [PATCH 70/72] Exception instead of None for simpler_gff3_feature --- .../ensembl/io/genomio/gff3/exceptions.py | 14 +++++++ .../ensembl/io/genomio/gff3/simplifier.py | 17 ++++---- src/python/tests/gff3/test_simplifier.py | 40 +++++++++++-------- .../gff3/test_simplifier/gene_unsupported.gff | 3 ++ 4 files changed, 49 insertions(+), 25 deletions(-) create mode 100644 src/python/tests/gff3/test_simplifier/gene_unsupported.gff diff --git a/src/python/ensembl/io/genomio/gff3/exceptions.py b/src/python/ensembl/io/genomio/gff3/exceptions.py index b91d5d47b..d4e72df75 100644 --- a/src/python/ensembl/io/genomio/gff3/exceptions.py +++ b/src/python/ensembl/io/genomio/gff3/exceptions.py @@ -16,8 +16,22 @@ __all__ = [ "GFFParserError", + "IgnoredFeatureError", + "UnsupportedFeatureError", ] class GFFParserError(Exception): """Error when parsing a GFF3 file.""" + + def __init__(self, message): + super().__init__(message) + self.message = message + + +class IgnoredFeatureError(GFFParserError): + """GFF3 feature can be ignored.""" + + +class UnsupportedFeatureError(GFFParserError): + """GFF3 feature is not supported.""" diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 87ba9c954..8eae7591a 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -38,7 +38,7 @@ from .extract_annotation import FunctionalAnnotations from .id_allocator import StableIDAllocator from .restructure import restructure_gene, remove_cds_from_pseudogene -from .exceptions import GFFParserError +from .exceptions import GFFParserError, IgnoredFeatureError, UnsupportedFeatureError class Records(list): @@ -130,9 +130,11 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: if split_genes: cleaned_features += split_genes else: - clean_feature = self.simpler_gff3_feature(feature) - if clean_feature is not None: + try: + clean_feature = self.simpler_gff3_feature(feature) cleaned_features.append(clean_feature) + except (UnsupportedFeatureError, IgnoredFeatureError) as err: + logging.debug(err.message) record.features = cleaned_features if self.fail_types: @@ -141,17 +143,17 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: if not self.skip_unrecognized: raise GFFParserError("Unrecognized types found, abort") - def simpler_gff3_feature(self, gene: SeqFeature) -> Optional[SeqFeature]: + def simpler_gff3_feature(self, gene: SeqFeature) -> SeqFeature: """Creates a simpler version of a GFF3 feature. - If the feature is invalid/skippable, returns None. + Raises InvalidGFFFeature if the feature is invalid/skippable. """ # Special cases non_gene = self.normalize_non_gene(gene) if non_gene: return non_gene if gene.type in self._biotypes["gene"]["ignored"]: - return None + raise IgnoredFeatureError(f"Ignored type {gene.type} for {gene.id}") # Synonym if gene.type == "protein_coding_gene": @@ -164,8 +166,7 @@ def simpler_gff3_feature(self, gene: SeqFeature) -> Optional[SeqFeature]: # What to do with unsupported gene types if gene.type not in self._biotypes["gene"]["supported"]: self.fail_types.add(f"gene={gene.type}") - logging.debug(f"Unsupported gene type: {gene.type} (for {gene.id})") - return None + raise UnsupportedFeatureError(f"Unsupported type {gene.type} for {gene.id}") # Normalize and store gene = self.normalize_gene(gene) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index 56845496f..f4c3739ea 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -25,6 +25,7 @@ from ensembl.io.genomio.gff3.exceptions import GFFParserError from ensembl.io.genomio.gff3.simplifier import GFFSimplifier +from ensembl.io.genomio.gff3.exceptions import IgnoredFeatureError, UnsupportedFeatureError def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: str) -> SeqFeature: @@ -36,14 +37,13 @@ def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: # Apply the named function check_method = getattr(simp, check_function) new_feat = check_method(feat) + # Put it back + if isinstance(new_feat, list): + simp.records[0].features = new_feat + else: + simp.records[0].features = [new_feat] + simp.records.to_gff(output_gff) - if new_feat is not None: - # Put it back - if isinstance(new_feat, list): - simp.records[0].features = new_feat - else: - simp.records[0].features = [new_feat] - simp.records.to_gff(output_gff) return new_feat @@ -233,13 +233,19 @@ def test_clean_gene( @pytest.mark.parametrize( - "in_gff, expected_gff", + "in_gff, expected_gff, expectation", [ - param("ok_gene.gff", "ok_gene.gff", id="ok gene"), - param("gene_ignored.gff", None, id="gene ignored"), - param("mobile_te.gff", "mobile_te.gff", id="TE"), - param("ok_protein_coding_gene.gff", "ok_gene.gff", id="ok protein_coding_gene"), - param("ok_tr_ignored.gff", "ok_gene.gff", id="ok gene with ignored transcripts/subtranscripts"), + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), + param("gene_ignored.gff", None, raises(IgnoredFeatureError), id="gene ignored"), + param("gene_unsupported.gff", None, raises(UnsupportedFeatureError), id="gene unsupported"), + param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), + param("ok_protein_coding_gene.gff", "ok_gene.gff", does_not_raise(), id="ok protein_coding_gene"), + param( + "ok_tr_ignored.gff", + "ok_gene.gff", + does_not_raise(), + id="ok gene with ignored transcripts/subtranscripts", + ), ], ) def test_simpler_gff3_feature( @@ -248,14 +254,14 @@ def test_simpler_gff3_feature( assert_files: Callable, in_gff: PathLike, expected_gff: Optional[PathLike], + expectation: ContextManager, ) -> None: """Test simplifying one gene (from a GFF3 file).""" input_gff = data_dir / in_gff output_gff = tmp_path / in_gff - new_feat = check_one_feature(input_gff, output_gff, "simpler_gff3_feature") - if expected_gff is None: - assert new_feat is None - else: + with expectation: + check_one_feature(input_gff, output_gff, "simpler_gff3_feature") + if expected_gff is not None: assert_files(output_gff, Path(data_dir / expected_gff)) diff --git a/src/python/tests/gff3/test_simplifier/gene_unsupported.gff b/src/python/tests/gff3/test_simplifier/gene_unsupported.gff new file mode 100644 index 000000000..5ba5d05f3 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/gene_unsupported.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source loremipsum 1 1000 . - . ID=1 From 21956076587e5d456b0a579687ed1115a47499b1 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Fri, 12 Apr 2024 18:28:15 +0100 Subject: [PATCH 71/72] check_one_feature no return --- src/python/tests/gff3/test_simplifier.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py index f4c3739ea..5abd728e5 100644 --- a/src/python/tests/gff3/test_simplifier.py +++ b/src/python/tests/gff3/test_simplifier.py @@ -28,7 +28,7 @@ from ensembl.io.genomio.gff3.exceptions import IgnoredFeatureError, UnsupportedFeatureError -def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: str) -> SeqFeature: +def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: str) -> None: """Load 1 feature from a GFF, apply a function, then write it back to a GFF.""" simp = GFFSimplifier() simp.records.from_gff(input_gff) @@ -44,8 +44,6 @@ def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: simp.records[0].features = [new_feat] simp.records.to_gff(output_gff) - return new_feat - @pytest.mark.parametrize( "in_gff, expected_gff", @@ -66,9 +64,8 @@ def test_create_gene_for_lone_transcript( """Test gene create gene for lone transcript.""" input_gff = data_dir / in_gff output_gff = tmp_path / Path(in_gff).name - new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") - if new_feat: - assert_files(output_gff, Path(data_dir / expected_gff)) + check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") + assert_files(output_gff, Path(data_dir / expected_gff)) @pytest.mark.parametrize( @@ -89,9 +86,8 @@ def test_create_gene_for_lone_cds( """Test gene create gene for lone CDS.""" input_gff = data_dir / in_gff output_gff = tmp_path / Path(in_gff).name - new_feat = check_one_feature(input_gff, output_gff, "create_gene_for_lone_cds") - if new_feat: - assert_files(output_gff, Path(data_dir / expected_gff)) + check_one_feature(input_gff, output_gff, "create_gene_for_lone_cds") + assert_files(output_gff, Path(data_dir / expected_gff)) @pytest.mark.parametrize( @@ -227,9 +223,8 @@ def test_clean_gene( """Test clean gene.""" input_gff = data_dir / in_gff output_gff = tmp_path / Path(in_gff).name - new_feat = check_one_feature(input_gff, output_gff, "clean_gene") - if new_feat: - assert_files(output_gff, Path(data_dir / expected_gff)) + check_one_feature(input_gff, output_gff, "clean_gene") + assert_files(output_gff, Path(data_dir / expected_gff)) @pytest.mark.parametrize( From efa874b04d7f3a27b9ff68258954f52c3cb59b45 Mon Sep 17 00:00:00 2001 From: Matthieu Barba Date: Mon, 15 Apr 2024 09:47:19 +0100 Subject: [PATCH 72/72] Update src/python/ensembl/io/genomio/gff3/simplifier.py Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/gff3/simplifier.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index 8eae7591a..85502e134 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -146,7 +146,9 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: def simpler_gff3_feature(self, gene: SeqFeature) -> SeqFeature: """Creates a simpler version of a GFF3 feature. - Raises InvalidGFFFeature if the feature is invalid/skippable. + Raises: + IgnoredFeatureError: If the feature type is ignored. + UnsupportedFeatureError: If the feature type is not supported. """ # Special cases non_gene = self.normalize_non_gene(gene)