diff --git a/src/python/ensembl/io/genomio/gff3/exceptions.py b/src/python/ensembl/io/genomio/gff3/exceptions.py index b91d5d47b..d4e72df75 100644 --- a/src/python/ensembl/io/genomio/gff3/exceptions.py +++ b/src/python/ensembl/io/genomio/gff3/exceptions.py @@ -16,8 +16,22 @@ __all__ = [ "GFFParserError", + "IgnoredFeatureError", + "UnsupportedFeatureError", ] class GFFParserError(Exception): """Error when parsing a GFF3 file.""" + + def __init__(self, message): + super().__init__(message) + self.message = message + + +class IgnoredFeatureError(GFFParserError): + """GFF3 feature can be ignored.""" + + +class UnsupportedFeatureError(GFFParserError): + """GFF3 feature is not supported.""" diff --git a/src/python/ensembl/io/genomio/gff3/extract_annotation.py b/src/python/ensembl/io/genomio/gff3/extract_annotation.py index 53bde3573..98b56b45c 100644 --- a/src/python/ensembl/io/genomio/gff3/extract_annotation.py +++ b/src/python/ensembl/io/genomio/gff3/extract_annotation.py @@ -298,13 +298,10 @@ def store_gene(self, gene: SeqFeature) -> None: """Record the functional_annotations of a gene and its children features.""" self.add_feature(gene, "gene") - cds_found = False for transcript in gene.sub_features: self.add_feature(transcript, "transcript", gene.id) for feat in transcript.sub_features: - if feat.type != "CDS": - continue - # Store CDS functional annotation only once - if not cds_found: - cds_found = True + if feat.type == "CDS": self.add_feature(feat, "translation", transcript.id) + # Store CDS functional annotation only once + break diff --git a/src/python/ensembl/io/genomio/gff3/restructure.py b/src/python/ensembl/io/genomio/gff3/restructure.py index 56531b146..a4c77df71 100644 --- a/src/python/ensembl/io/genomio/gff3/restructure.py +++ b/src/python/ensembl/io/genomio/gff3/restructure.py @@ -22,6 +22,7 @@ "move_only_exons_to_new_mrna", "move_cds_to_existing_mrna", "remove_extra_exons", + "remove_cds_from_pseudogene", ] from collections import Counter @@ -264,3 +265,28 @@ def remove_extra_exons(gene: SeqFeature) -> None: gene.sub_features += others else: raise GFFParserError(f"Can't remove extra exons for {gene.id}, not all start with 'id-'") + + +def remove_cds_from_pseudogene(gene: SeqFeature) -> None: + """Removes the CDSs from a pseudogene. + + This assumes the CDSs are sub features of the transcript or the gene. + + """ + if gene.type != "pseudogene": + return + + gene_subfeats = [] + for transcript in gene.sub_features: + if transcript.type == "CDS": + logging.debug(f"Remove pseudo CDS {transcript.id}") + else: + new_subfeats = [] + for feat in transcript.sub_features: + if feat.type == "CDS": + logging.debug(f"Remove pseudo CDS {feat.id}") + else: + new_subfeats.append(feat) + transcript.sub_features = new_subfeats + gene_subfeats.append(transcript) + gene.sub_features = gene_subfeats diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py index ec51af815..85502e134 100644 --- a/src/python/ensembl/io/genomio/gff3/simplifier.py +++ b/src/python/ensembl/io/genomio/gff3/simplifier.py @@ -18,7 +18,6 @@ __all__ = [ "Records", - "GFFParserError", "GFFSimplifier", ] @@ -27,7 +26,7 @@ from os import PathLike from pathlib import Path import re -from typing import Dict, List, Optional +from typing import List, Optional, Set from BCBio import GFF from Bio.SeqRecord import SeqRecord @@ -38,14 +37,33 @@ from ensembl.io.genomio.utils.json_utils import get_json from .extract_annotation import FunctionalAnnotations from .id_allocator import StableIDAllocator -from .restructure import restructure_gene +from .restructure import restructure_gene, remove_cds_from_pseudogene +from .exceptions import GFFParserError, IgnoredFeatureError, UnsupportedFeatureError class Records(list): """List of GFF3 SeqRecords.""" + def from_gff(self, in_gff_path: PathLike, excluded: Optional[List[str]] = None) -> None: + """Loads records from a GFF3 file. + + Args: + in_gff_path: Input GFF3 file path. + excluded: Record IDs to not load from the GFF3 file. + """ + if excluded is None: + excluded = [] + with Path(in_gff_path).open("r") as in_gff_fh: + for record in GFF.parse(in_gff_fh): + if record.id in excluded: + logging.debug(f"Skip seq_region {record.id} - in exclusion list") + continue + clean_record = SeqRecord(record.seq, id=record.id) + clean_record.features = record.features + self.append(clean_record) + def to_gff(self, out_gff_path: PathLike) -> None: - """Print out the current list of records in a GFF3 file. + """Writes the current list of records in a GFF3 file. Args: out_gff_path: Path to GFF3 file where to write the records. @@ -54,10 +72,6 @@ def to_gff(self, out_gff_path: PathLike) -> None: GFF.write(self, out_gff_fh) -class GFFParserError(Exception): - """Error when parsing a GFF3 file.""" - - class GFFSimplifier: """Parse a GGF3 file and output a cleaned up GFF3 + annotation json file. @@ -97,8 +111,8 @@ def __init__( # Other preparations self.stable_ids = StableIDAllocator() self.stable_ids.set_prefix(self.genome) - self.exclude_seq_regions: List = [] - self.fail_types: Dict[str, int] = {} + self.exclude_seq_regions: List[str] = [] + self.fail_types: Set = set() # Init the actual data we will store self.records = Records() @@ -108,94 +122,181 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None: """Loads a GFF3 from INSDC and rewrites it in a simpler version, whilst also writing a functional annotation file. """ + self.records.from_gff(in_gff_path, self.exclude_seq_regions) + for record in self.records: + cleaned_features = [] + for feature in record.features: + split_genes = self.normalize_mirna(feature) + if split_genes: + cleaned_features += split_genes + else: + try: + clean_feature = self.simpler_gff3_feature(feature) + cleaned_features.append(clean_feature) + except (UnsupportedFeatureError, IgnoredFeatureError) as err: + logging.debug(err.message) + record.features = cleaned_features + + if self.fail_types: + fail_errors = "\n ".join(list(self.fail_types)) + logging.warning(f"Unrecognized types found:\n {fail_errors}") + if not self.skip_unrecognized: + raise GFFParserError("Unrecognized types found, abort") + + def simpler_gff3_feature(self, gene: SeqFeature) -> SeqFeature: + """Creates a simpler version of a GFF3 feature. - with Path(in_gff_path).open("r") as in_gff_fh: - for record in GFF.parse(in_gff_fh): - if record.id in self.exclude_seq_regions: - logging.debug(f"Skip seq_region {record.id}") - continue + Raises: + IgnoredFeatureError: If the feature type is ignored. + UnsupportedFeatureError: If the feature type is not supported. + """ + # Special cases + non_gene = self.normalize_non_gene(gene) + if non_gene: + return non_gene + if gene.type in self._biotypes["gene"]["ignored"]: + raise IgnoredFeatureError(f"Ignored type {gene.type} for {gene.id}") + + # Synonym + if gene.type == "protein_coding_gene": + gene.type = "gene" - # Clean all root features and make clean record - clean_record = SeqRecord(record.seq, id=record.id) - for feature in record.features: - split_genes = self.normalize_mirna(feature) - if split_genes: - clean_record.features += split_genes - else: - clean_feature = self.simpler_gff3_feature(feature) - if clean_feature is not None: - clean_record.features.append(clean_feature) - self.records.append(clean_record) + # Lone sub-gene features, create a gene + gene = self.create_gene_for_lone_transcript(gene) + gene = self.create_gene_for_lone_cds(gene) - if self.fail_types: - fail_errors = ", ".join(self.fail_types.keys()) - logging.warning(f"Unrecognized types found: {fail_errors}") - if not self.skip_unrecognized: - raise GFFParserError("Unrecognized types found, abort") + # What to do with unsupported gene types + if gene.type not in self._biotypes["gene"]["supported"]: + self.fail_types.add(f"gene={gene.type}") + raise UnsupportedFeatureError(f"Unsupported type {gene.type} for {gene.id}") - def simpler_gff3_feature(self, feat: SeqFeature) -> Optional[SeqFeature]: - """Creates a simpler version of a GFF3 feature. + # Normalize and store + gene = self.normalize_gene(gene) + self.annotations.store_gene(gene) + return self.clean_gene(gene) - If the feature is invalid/skippable, returns None. + def create_gene_for_lone_transcript(self, feat: SeqFeature) -> SeqFeature: + """Returns a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA_gene' for all others. + Args: + feat: The transcript for which we want to create a gene. """ - - ignored_gene_types = self._biotypes["gene"]["ignored"] - allowed_non_gene_types = self._biotypes["non_gene"]["supported"] - allowed_gene_types = self._biotypes["gene"]["supported"] transcript_types = self._biotypes["transcript"]["supported"] + if feat.type not in transcript_types: + return feat - # Skip explictly ignored features - if feat.type in ignored_gene_types: - return None + new_type = "ncRNA_gene" + if feat.type in ("tRNA", "rRNA"): + new_type = "gene" + logging.debug(f"Put the transcript {feat.type} in a {new_type} parent feature") + new_gene = SeqFeature(feat.location, type=new_type) + new_gene.qualifiers["source"] = feat.qualifiers["source"] + new_gene.sub_features = [feat] - # Special processing of non-gene features - if feat.type in allowed_non_gene_types: - if feat.type in ("mobile_genetic_element", "transposable_element"): - feat = self.format_mobile_element(feat) - return feat - return None + # Use the transcript ID for the gene, and generate a sub ID for the transcript + new_gene.id = feat.id + new_gene.qualifiers["ID"] = new_gene.id + feat.id = self.stable_ids.generate_transcript_id(new_gene.id, 1) + feat.qualifiers["ID"] = feat.id - # From here we expect only genes - gene = feat + return new_gene - if gene.type == "protein_coding_gene": - gene.type = "gene" + def create_gene_for_lone_cds(self, feat: SeqFeature) -> SeqFeature: + """Returns a gene created for a lone CDS. + + Args: + feat: The CDS for which we want to create a gene. + """ + if feat.type != "CDS": + return feat - # Create actual genes from transcripts/CDS top level features - if gene.type in transcript_types: - gene = self.transcript_gene(gene) - elif gene.type == "CDS": - gene = self.cds_gene(gene) + logging.debug(f"Put the lone CDS in gene-mRNA parent features for {feat.id}") - # What to do with unsupported gene types - if gene.type not in allowed_gene_types: - self.fail_types["gene=" + gene.type] = 1 - logging.debug(f"Unsupported gene type: {gene.type} (for {gene.id})") + # Create a transcript, add the CDS + transcript = SeqFeature(feat.location, type="mRNA") + transcript.qualifiers["source"] = feat.qualifiers["source"] + transcript.sub_features = [feat] + + # Add an exon too + exon = SeqFeature(feat.location, type="exon") + exon.qualifiers["source"] = feat.qualifiers["source"] + transcript.sub_features.append(exon) + + # Create a gene, add the transcript + gene_type = "gene" + if ("pseudo" in feat.qualifiers) and (feat.qualifiers["pseudo"][0] == "true"): + gene_type = "pseudogene" + del feat.qualifiers["pseudo"] + new_gene = SeqFeature(feat.location, type=gene_type) + new_gene.qualifiers["source"] = feat.qualifiers["source"] + new_gene.sub_features = [transcript] + new_gene.id = self.stable_ids.generate_gene_id() + new_gene.qualifiers["ID"] = new_gene.id + transcript.id = self.stable_ids.generate_transcript_id(new_gene.id, 1) + transcript.qualifiers["ID"] = transcript.id + + return new_gene + + def normalize_non_gene(self, feat: SeqFeature) -> Optional[SeqFeature]: + """Returns a normalised "non-gene" or `None` if not applicable. + + Only transposable elements supported at the moment. + + Args: + feat: Feature to normalise. + + Raises: + NotImplementedError: If the feature is a not supported non-gene. + """ + + if feat.type not in self._biotypes["non_gene"]["supported"]: return None + if feat.type in ("mobile_genetic_element", "transposable_element"): + feat.type = "transposable_element" + feat = self._normalize_mobile_genetic_element(feat) + # Generate ID if needed + feat.id = self.stable_ids.normalize_gene_id(feat) + feat.qualifiers["ID"] = feat.id + + self.annotations.add_feature(feat, "transposable_element") + return self.clean_gene(feat) + # This is a failsafe in case you add supported non-genes + raise NotImplementedError(f"Unsupported non-gene: {feat.type} for {feat.id}") + + def _normalize_mobile_genetic_element(self, feat: SeqFeature) -> SeqFeature: + """Normalize a mobile element if it has a mobile_element_type field.""" + try: + mobile_element_type = feat.qualifiers["mobile_element_type"] + except KeyError: + logging.warning("No 'mobile_element_type' tag found") + return feat - # Normalize, store annotation, and return the cleaned up gene - gene = self.normalize_gene(gene) - self.annotations.store_gene(gene) - return self.clean_gene(gene) + # Get the type (and name) from the attrib + element_type, _, element_name = mobile_element_type[0].partition(":") + description = element_type + if element_name: + description += f" ({element_name})" + + # Keep the metadata in the description if the type is known + if element_type in ("transposon", "retrotransposon"): + if not feat.qualifiers.get("product"): + feat.qualifiers["product"] = [description] + return feat + raise GFFParserError(f"'mobile_element_type' is not a transposon: {element_type}") def clean_gene(self, gene: SeqFeature) -> SeqFeature: """Return the same gene without qualifiers unrelated to the gene structure.""" old_gene_qualifiers = gene.qualifiers - try: - gene.qualifiers = {"ID": gene.id, "source": old_gene_qualifiers["source"]} - except KeyError as err: - raise KeyError(f"Missing source for {gene.id}") from err + gene.qualifiers = {"ID": gene.id, "source": old_gene_qualifiers["source"]} for transcript in gene.sub_features: # Replace qualifiers old_transcript_qualifiers = transcript.qualifiers transcript.qualifiers = { "ID": transcript.id, "Parent": gene.id, + "source": old_transcript_qualifiers["source"], } - if "source" in old_transcript_qualifiers: - transcript.qualifiers["source"] = old_transcript_qualifiers["source"] for feat in transcript.sub_features: old_qualifiers = feat.qualifiers @@ -205,82 +306,10 @@ def clean_gene(self, gene: SeqFeature) -> SeqFeature: "source": old_qualifiers["source"], } if feat.type == "CDS": - try: - feat.qualifiers["phase"] = old_qualifiers["phase"] - except KeyError as err: - raise KeyError( - f"Missing phase for gene {gene.type} {gene.id}, CDS {feat.id} ({old_qualifiers})" - ) from err + feat.qualifiers["phase"] = old_qualifiers["phase"] return gene - # FORMATTERS - def format_mobile_element(self, feat: SeqFeature) -> SeqFeature: - """Given a mobile_genetic_element feature, transform it into a transposable_element""" - - # Change mobile_genetic_element into a transposable_element feature - if feat.type == "mobile_genetic_element": - mobile_element_type = feat.qualifiers.get("mobile_element_type", []) - if mobile_element_type: - # Get the type (and name) from the attrib - if ":" in mobile_element_type[0]: - element_type, element_name = mobile_element_type[0].split(":") - description = f"{element_type} ({element_name})" - else: - element_type = mobile_element_type[0] - description = element_type - - # Keep the metadata in the description if the type is known - if element_type in ("transposon", "retrotransposon"): - feat.type = "transposable_element" - if not feat.qualifiers.get("product"): - feat.qualifiers["product"] = [description] - else: - logging.warning( - f"Mobile genetic element 'mobile_element_type' is not transposon: {element_type}" - ) - return feat - else: - logging.warning("Mobile genetic element does not have a 'mobile_element_type' tag") - return feat - elif feat.type == "transposable_element": - pass - else: - logging.warning(f"Feature {feat.id} is not a supported TE feature {feat.type}") - return feat - - # Generate ID if needed and add it to the functional annotation - feat.id = self.stable_ids.normalize_gene_id(feat) - self.annotations.add_feature(feat, "transposable_element") - feat.qualifiers = {"ID": feat.id} - - return feat - - def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature: - """Returns the equivalent Ensembl biotype feature for gene segment transcript features. - - Supported features: "C_gene_segment" and "V_gene_segment". - - Args: - transcript: Gene segment transcript feature. - - """ - # Change V/C_gene_segment into a its corresponding transcript names - if transcript.type in ("C_gene_segment", "V_gene_segment"): - standard_name = transcript.qualifiers["standard_name"][0] - biotype = transcript.type.replace("_segment", "") - if re.search(r"\b(immunoglobulin|ig)\b", standard_name, flags=re.IGNORECASE): - biotype = f"IG_{biotype}" - elif re.search(r"\bt[- _]cell\b", standard_name, flags=re.IGNORECASE): - biotype = f"TR_{biotype}" - else: - logging.warning( - f"Unexpected 'standard_name' content for feature {transcript.id}: {standard_name}" - ) - return transcript - transcript.type = biotype - return transcript - def normalize_gene(self, gene: SeqFeature) -> SeqFeature: """Returns a normalized gene structure, separate from the functional elements. @@ -305,7 +334,7 @@ def normalize_pseudogene(self, gene: SeqFeature) -> None: if self.allow_pseudogene_with_cds: self.stable_ids.normalize_pseudogene_cds_id(gene) else: - self.remove_cds_from_pseudogene(gene) + remove_cds_from_pseudogene(gene) def normalize_transcripts(self, gene: SeqFeature) -> None: """Normalizes a transcript.""" @@ -319,7 +348,7 @@ def normalize_transcripts(self, gene: SeqFeature) -> None: transcript.type not in allowed_transcript_types and transcript.type not in ignored_transcript_types ): - self.fail_types["transcript=" + transcript.type] = 1 + self.fail_types.add(f"transcript={transcript.type}") logging.warning( f"Unrecognized transcript type: {transcript.type}" f" for {transcript.id} ({gene.id})" ) @@ -339,9 +368,36 @@ def normalize_transcripts(self, gene: SeqFeature) -> None: for elt in sorted(transcripts_to_delete, reverse=True): gene.sub_features.pop(elt) + def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature: + """Returns the equivalent Ensembl biotype feature for gene segment transcript features. + + Supported features: "C_gene_segment" and "V_gene_segment". + + Args: + transcript: Gene segment transcript feature. + + Raises: + GFFParserError: Missing or unexpected transcript's standard name. + """ + if transcript.type not in ("C_gene_segment", "V_gene_segment"): + return transcript + + # Change V/C_gene_segment into a its corresponding transcript names + try: + standard_name = transcript.qualifiers["standard_name"][0] + except KeyError as err: + raise GFFParserError(f"No standard_name for {transcript.type}") from err + biotype = transcript.type.replace("_segment", "") + if re.search(r"\b(immunoglobulin|ig)\b", standard_name, flags=re.IGNORECASE): + transcript.type = f"IG_{biotype}" + elif re.search(r"\bt[- _]cell\b", standard_name, flags=re.IGNORECASE): + transcript.type = f"TR_{biotype}" + else: + raise GFFParserError(f"Unexpected 'standard_name' for {transcript.id}: {standard_name}") + return transcript + def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFeature) -> SeqFeature: """Returns a transcript with normalized sub-features.""" - ignored_transcript_types = self._biotypes["transcript"]["ignored"] exons_to_delete = [] exon_number = 1 for tcount, feat in enumerate(transcript.sub_features): @@ -352,19 +408,18 @@ def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFea # Replace qualifiers old_exon_qualifiers = feat.qualifiers feat.qualifiers = {"Parent": transcript.id} - if "source" in old_exon_qualifiers: - feat.qualifiers["source"] = old_exon_qualifiers["source"] + feat.qualifiers["source"] = old_exon_qualifiers["source"] elif feat.type == "CDS": # New CDS ID feat.id = self.stable_ids.normalize_cds_id(feat.id) if feat.id in ("", gene.id, transcript.id): feat.id = f"{transcript.id}_cds" else: - if feat.type in ignored_transcript_types: + if feat.type in self._biotypes["transcript"]["ignored"]: exons_to_delete.append(tcount) continue - self.fail_types[f"sub_transcript={feat.type}"] = 1 + self.fail_types.add(f"sub_transcript={feat.type}") logging.warning( f"Unrecognized exon type for {feat.type}: {feat.id}" f" (for transcript {transcript.id} of type {transcript.type})" @@ -377,78 +432,6 @@ def _normalize_transcript_subfeatures(self, gene: SeqFeature, transcript: SeqFea transcript.sub_features.pop(elt) return transcript - # COMPLETION - def transcript_gene(self, ncrna: SeqFeature) -> SeqFeature: - """Create a gene for lone transcripts: 'gene' for tRNA/rRNA, and 'ncRNA' for all others - - Args: - ncrna: the transcript for which we want to create a gene. - - Returns: - The gene that contains the transcript. - - """ - new_type = "ncRNA_gene" - if ncrna.type in ("tRNA", "rRNA"): - new_type = "gene" - logging.debug(f"Put the transcript {ncrna.type} in a {new_type} parent feature") - gene = SeqFeature(ncrna.location, type=new_type) - gene.qualifiers["source"] = ncrna.qualifiers["source"] - gene.sub_features = [ncrna] - gene.id = ncrna.id - - return gene - - def cds_gene(self, cds: SeqFeature) -> SeqFeature: - """Returns a gene created for a lone CDS.""" - - logging.debug(f"Put the lone CDS in gene-mRNA parent features for {cds.id}") - - # Create a transcript, add the CDS - transcript = SeqFeature(cds.location, type="mRNA") - transcript.qualifiers["source"] = cds.qualifiers["source"] - transcript.sub_features = [cds] - - # Add an exon too - exon = SeqFeature(cds.location, type="exon") - exon.qualifiers["source"] = cds.qualifiers["source"] - transcript.sub_features.append(exon) - - # Create a gene, add the transcript - gene_type = "gene" - if ("pseudo" in cds.qualifiers) and (cds.qualifiers["pseudo"][0] == "true"): - gene_type = "pseudogene" - gene = SeqFeature(cds.location, type=gene_type) - gene.qualifiers["source"] = cds.qualifiers["source"] - gene.sub_features = [transcript] - gene.id = self.stable_ids.generate_gene_id() - - return gene - - def remove_cds_from_pseudogene(self, gene: SeqFeature) -> None: - """Removes the CDS from a pseudogene. - - This assumes the CDSs are sub features of the transcript or the gene. - - """ - if gene.type != "pseudogene": - return - - gene_subfeats = [] - for transcript in gene.sub_features: - if transcript.type == "CDS": - logging.debug(f"Remove pseudo CDS {transcript.id}") - continue - new_subfeats = [] - for feat in transcript.sub_features: - if feat.type == "CDS": - logging.debug(f"Remove pseudo CDS {feat.id}") - continue - new_subfeats.append(feat) - transcript.sub_features = new_subfeats - gene_subfeats.append(transcript) - gene.sub_features = gene_subfeats - def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: """Returns gene representations from a miRNA gene that can be loaded in an Ensembl database. @@ -459,16 +442,30 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: GFFParserError: If gene has more than 1 transcript, the transcript was not formatted correctly or there are unknown sub-features. """ - - transcript = gene.sub_features - if (len(transcript) == 0) or (transcript[0].type != "primary_transcript"): + base_id = gene.id + transcripts = gene.sub_features + + # Insert main gene first if needed + old_gene = gene + if gene.type == "primary_transcript": + primary = old_gene + gene = SeqFeature(primary.location, type="gene") + gene.sub_features = [primary] + gene.qualifiers = primary.qualifiers + transcripts = gene.sub_features + gene.id = f"{base_id}_0" + gene.qualifiers["ID"] = gene.id + + if (len(transcripts) == 0) or (transcripts[0].type != "primary_transcript"): return [] - if len(transcript) > 1: + if len(transcripts) > 1: raise GFFParserError(f"Gene has too many sub_features for miRNA {gene.id}") + # Passed the checks + primary = transcripts[0] + logging.debug(f"Formatting miRNA gene {gene.id}") - primary = transcript[0] new_genes = [] new_primary_subfeatures = [] num = 1 @@ -476,7 +473,7 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: if sub.type == "exon": new_primary_subfeatures.append(sub) elif sub.type == "miRNA": - new_gene_id = f"{gene.id}_{num}" + new_gene_id = f"{base_id}_{num}" num += 1 new_gene = SeqFeature(sub.location, "gene", id=new_gene_id) new_gene.qualifiers = {"source": sub.qualifiers["source"], "ID": new_gene_id} @@ -487,9 +484,10 @@ def normalize_mirna(self, gene: SeqFeature) -> List[SeqFeature]: primary.sub_features = new_primary_subfeatures if not new_genes: - raise GFFParserError(f"Could not parse a primary_transcript for {gene.id}") - - all_genes = [gene] + new_genes + logging.debug(f"Primary_transcript without miRNA in {gene.id}") + all_genes = [gene] + else: + all_genes = [gene] + new_genes # Normalize like other genes all_genes_cleaned = [] diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py index b6021a119..4ec58cfb4 100644 --- a/src/python/tests/conftest.py +++ b/src/python/tests/conftest.py @@ -76,7 +76,14 @@ def _assert_files(result_path: Path, expected_path: Path) -> None: results = result_fh.readlines() with open(expected_path, "r") as expected_fh: expected = expected_fh.readlines() - files_diff = list(unified_diff(results, expected, fromfile="Test-made file", tofile="Expected file")) + files_diff = list( + unified_diff( + results, + expected, + fromfile=f"Test-made file {result_path.name}", + tofile=f"Expected file {expected_path.name}", + ) + ) assert_message = f"Test-made and expected files differ\n{' '.join(files_diff)}" assert len(files_diff) == 0, assert_message diff --git a/src/python/tests/gff3/test_extract_annotation.py b/src/python/tests/gff3/test_extract_annotation.py index 3f7ac1439..81fc9879f 100644 --- a/src/python/tests/gff3/test_extract_annotation.py +++ b/src/python/tests/gff3/test_extract_annotation.py @@ -328,43 +328,53 @@ def test_transfer_descriptions( @pytest.mark.dependency(depends=["add_feature"]) @pytest.mark.parametrize( - "cds_parts, num_genes, num_tr, num_cds", + "num_cds, cds_parts, expected_num_genes, expected_num_tr, expected_num_cds", [ - pytest.param(0, 1, 1, 0, id="Store gene without CDS"), - pytest.param(1, 1, 1, 1, id="Store gene with CDS in one part"), - pytest.param(2, 1, 1, 1, id="Store gene with CDS in 2 parts"), + pytest.param(0, 0, 1, 1, 0, id="Store gene without CDS"), + pytest.param(1, 1, 1, 1, 1, id="Store gene with 1 CDS in one part"), + pytest.param(1, 2, 1, 1, 1, id="Store gene with 1 CDS in 2 parts"), + pytest.param(2, 1, 1, 2, 2, id="Store gene with 2 CDS in 1 part each"), + pytest.param(2, 2, 1, 2, 2, id="Store gene with 2 CDS in 2 part each"), ], ) -def test_store_gene(cds_parts: int, num_genes: int, num_tr: int, num_cds: int) -> None: +def test_store_gene( + cds_parts: int, num_cds: int, expected_num_genes: int, expected_num_tr: int, expected_num_cds: int +) -> None: """Test store_gene given a gene Feature with a transcript and optional translation. Args: - cds_parts: Number of parts of the one CDS (0 means no CDS) - num_genes: Number of genes stored as expected - num_tr: Number of transcripts stored as expected - num_cds: Number of CDSs stored as expected + num_cds: Number of CDSs stored + cds_parts: Number of parts of each CDS + expected_num_genes: Number of genes stored as expected + expected_num_tr: Number of transcripts stored as expected + expected_num_cds: Number of CDSs stored as expected """ annot = FunctionalAnnotations() gene_name = "gene_A" transcript_name = "tran_A" one_gene = SeqFeature(type="gene", id=gene_name) one_gene.sub_features = [] - one_transcript = SeqFeature(type="mRNA", id=transcript_name) - one_transcript.sub_features = [] - - # Add one exon - one_exon = SeqFeature(type="exon", id="exon_A") - one_transcript.sub_features.append(one_exon) # Add a translation (possibly in parts) - if cds_parts > 0: - for _ in range(1, cds_parts + 1): - one_translation = SeqFeature(type="CDS", id="cds_A") - one_transcript.sub_features.append(one_translation) - - one_gene.sub_features.append(one_transcript) + if num_cds: + for cds_number in range(1, num_cds + 1): + transcript = SeqFeature(type="mRNA", id=f"tran_{cds_number}") + transcript.sub_features = [] + exon = SeqFeature(type="exon", id=f"exon_{cds_number}") + transcript.sub_features.append(exon) + if cds_parts > 0: + for _ in range(1, cds_parts + 1): + translation = SeqFeature(type="CDS", id=f"cds_{cds_number}") + transcript.sub_features.append(translation) + one_gene.sub_features.append(transcript) + else: + one_transcript = SeqFeature(type="mRNA", id=transcript_name) + one_transcript.sub_features = [] + one_exon = SeqFeature(type="exon", id="exon_A") + one_transcript.sub_features.append(one_exon) + one_gene.sub_features.append(one_transcript) annot.store_gene(one_gene) - assert len(annot.features["gene"]) == num_genes - assert len(annot.features["transcript"]) == num_tr - assert len(annot.features["translation"]) == num_cds + assert len(annot.features["gene"]) == expected_num_genes + assert len(annot.features["transcript"]) == expected_num_tr + assert len(annot.features["translation"]) == expected_num_cds diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py new file mode 100644 index 000000000..0f282fe47 --- /dev/null +++ b/src/python/tests/gff3/test_records.py @@ -0,0 +1,69 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.gff3.simplifier.Records` class.""" + +from contextlib import nullcontext as no_raise +from os import PathLike +from pathlib import Path +from typing import Callable, ContextManager, List, Optional + +import pytest +from pytest import param, raises + +from ensembl.io.genomio.gff3.simplifier import Records + + +@pytest.mark.parametrize( + "in_gff, excluded, expected_loaded, expectation", + [ + param("record_n2.gff", None, ["scaffold1", "scaffold2"], no_raise(), id="2 records"), + param("record_n2.gff", ["scaffold1"], ["scaffold2"], no_raise(), id="2 records, exclude 1"), + param("record_n1.gff", ["Lorem"], ["scaffold1"], no_raise(), id="1 record, exclude not in record"), + param("invalid.gff", None, [], raises(AssertionError), id="Invalid GFF3"), + ], +) +def test_from_gff( + data_dir: Path, + in_gff: PathLike, + excluded: Optional[List[str]], + expected_loaded: List[str], + expectation: ContextManager, +) -> None: + """Test loading GFF records from file.""" + input_gff = data_dir / in_gff + + records = Records() + with expectation: + records.from_gff(input_gff, excluded) + if expected_loaded: + record_names = [record.id for record in records] + assert record_names == expected_loaded + + +@pytest.mark.parametrize( + "in_gff", + [ + param("record_n1.gff", id="1 record"), + param("record_n2.gff", id="2 records"), + ], +) +def test_to_gff(tmp_path: Path, data_dir: Path, assert_files: Callable, in_gff: PathLike) -> None: + """Test writing GFF records to file.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / in_gff + records = Records() + records.from_gff(input_gff) + records.to_gff(output_gff) + assert_files(input_gff, output_gff) diff --git a/src/python/tests/gff3/test_records/invalid.gff b/src/python/tests/gff3/test_records/invalid.gff new file mode 100644 index 000000000..f1f10f6d2 --- /dev/null +++ b/src/python/tests/gff3/test_records/invalid.gff @@ -0,0 +1,2 @@ +>Lorem ipsum +HAHAHA diff --git a/src/python/tests/gff3/test_records/record_n1.gff b/src/python/tests/gff3/test_records/record_n1.gff new file mode 100644 index 000000000..8bd717f15 --- /dev/null +++ b/src/python/tests/gff3/test_records/record_n1.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_records/record_n2.gff b/src/python/tests/gff3/test_records/record_n2.gff new file mode 100644 index 000000000..82996e1d8 --- /dev/null +++ b/src/python/tests/gff3/test_records/record_n2.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +##sequence-region scaffold2 1 1000 +scaffold2 Source gene 1 1000 . - . ID=LOREMIPSUM2 diff --git a/src/python/tests/gff3/test_restructure.py b/src/python/tests/gff3/test_restructure.py index b3d1779c1..5d25f949f 100644 --- a/src/python/tests/gff3/test_restructure.py +++ b/src/python/tests/gff3/test_restructure.py @@ -334,3 +334,28 @@ def test_restructure_gene( with expectation: restructure.restructure_gene(gene) assert gen.get_sub_structure(gene) == {"gene": expected_children} + + +@pytest.mark.parametrize( + "children, expected_children", + [ + param("gene", "gene", id="gene"), + param("pseudogene", "pseudogene", id="pseudogene"), + param({"pseudogene": ["mRNA"]}, {"pseudogene": ["mRNA"]}, id="pseudogene mRNA"), + param( + {"pseudogene": [{"mRNA": ["CDS", "CDS"]}]}, {"pseudogene": ["mRNA"]}, id="pseudogene mRNA CDSs" + ), + param( + {"pseudogene": [{"mRNA": ["CDS", "exon"]}]}, + {"pseudogene": [{"mRNA": ["exon"]}]}, + id="pseudogene mRNA CDSs, exons", + ), + param({"pseudogene": ["CDS", "CDS"]}, "pseudogene", id="pseudogene CDSs"), + ], +) +def test_remove_cds_from_pseudogene(children: List[Any], expected_children: List[Any]) -> None: + """Test CDS removal from pseudogene.""" + gen = FeatGenerator() + gene = gen.make_structure([children])[0] + restructure.remove_cds_from_pseudogene(gene) + assert gen.get_sub_structure(gene) == expected_children diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py new file mode 100644 index 000000000..5abd728e5 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier.py @@ -0,0 +1,455 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.gff3.simplifier` module.""" + +from contextlib import nullcontext as does_not_raise +from os import PathLike +from pathlib import Path +from typing import Callable, ContextManager, Optional + +from Bio.SeqFeature import SeqFeature +import pytest +from pytest import param, raises + +from ensembl.io.genomio.gff3.exceptions import GFFParserError +from ensembl.io.genomio.gff3.simplifier import GFFSimplifier +from ensembl.io.genomio.gff3.exceptions import IgnoredFeatureError, UnsupportedFeatureError + + +def check_one_feature(input_gff: PathLike, output_gff: PathLike, check_function: str) -> None: + """Load 1 feature from a GFF, apply a function, then write it back to a GFF.""" + simp = GFFSimplifier() + simp.records.from_gff(input_gff) + # Get the only feature + feat = simp.records[0].features[0] + # Apply the named function + check_method = getattr(simp, check_function) + new_feat = check_method(feat) + # Put it back + if isinstance(new_feat, list): + simp.records[0].features = new_feat + else: + simp.records[0].features = [new_feat] + simp.records.to_gff(output_gff) + + +@pytest.mark.parametrize( + "in_gff, expected_gff", + [ + param("ok_gene.gff", "ok_gene.gff", id="ok gene"), + param("lone/transcript.gff", "lone/transcript_simped.gff", id="lone transcript"), + param("lone/trna.gff", "lone/trna_simped.gff", id="lone tRNA"), + param("lone/rrna.gff", "lone/rrna_simped.gff", id="lone rRNA"), + ], +) +def test_create_gene_for_lone_transcript( + tmp_path: Path, + data_dir: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, +) -> None: + """Test gene create gene for lone transcript.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / Path(in_gff).name + check_one_feature(input_gff, output_gff, "create_gene_for_lone_transcript") + assert_files(output_gff, Path(data_dir / expected_gff)) + + +@pytest.mark.parametrize( + "in_gff, expected_gff", + [ + param("ok_gene.gff", "ok_gene.gff", id="ok gene"), + param("lone/cds.gff", "lone/cds_simped.gff", id="lone CDS"), + param("lone/cds_pseudo.gff", "lone/cds_pseudo_simped.gff", id="lone pseudo CDS"), + ], +) +def test_create_gene_for_lone_cds( + data_dir: Path, + tmp_path: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, +) -> None: + """Test gene create gene for lone CDS.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / Path(in_gff).name + check_one_feature(input_gff, output_gff, "create_gene_for_lone_cds") + assert_files(output_gff, Path(data_dir / expected_gff)) + + +@pytest.mark.parametrize( + "in_type, in_mobile_type, in_product, out_type, out_description, expectation", + [ + param("gene", None, None, "gene", None, does_not_raise(), id="Gene, skip"), + param("transposable_element", None, None, "transposable_element", None, does_not_raise(), id="TE"), + param("mobile_genetic_element", None, None, "transposable_element", None, does_not_raise(), id="MGE"), + param( + "transposable_element", + "transposon", + None, + "transposable_element", + "transposon", + does_not_raise(), + id="MGE, transposon", + ), + param( + "transposable_element", + "transposon:LOREM", + None, + "transposable_element", + "transposon (LOREM)", + does_not_raise(), + id="MGE, transposon named", + ), + param( + "transposable_element", + "retrotransposon:LOREM", + None, + "transposable_element", + "retrotransposon (LOREM)", + does_not_raise(), + id="MGE, retrotransposon named", + ), + param( + "transposable_element", + "UNKNOWNtransposon:LOREM", + None, + "transposable_element", + None, + raises(GFFParserError), + id="MGE, unknown type", + ), + param( + "transposable_element", + "transposon", + "PROD", + "transposable_element", + "PROD", + does_not_raise(), + id="MGE, transposon, product exists", + ), + ], +) +def test_normalize_non_gene( + in_type: str, + in_mobile_type: Optional[str], + in_product: Optional[str], + out_type: str, + out_description: Optional[str], + expectation: ContextManager, +) -> None: + """Test non-gene normalization.""" + simp = GFFSimplifier() + feat = SeqFeature(None, in_type) + feat.qualifiers = {"source": "LOREM"} + if in_mobile_type is not None: + feat.qualifiers["mobile_element_type"] = [in_mobile_type] + if in_product is not None: + feat.qualifiers["product"] = [in_product] + feat.sub_features = [] + with expectation: + new_feat = simp.normalize_non_gene(feat) + if new_feat is not None: + assert new_feat.type == out_type + if out_description is not None: + assert new_feat.qualifiers == feat.qualifiers + + +def test_normalize_non_gene_not_implemented() -> None: + """Test non-gene not in the biotype list.""" + simp = GFFSimplifier() + simp._biotypes = {"non_gene": {"supported": ["non_gene_name"]}} # pylint: disable=protected-access + feat = SeqFeature(None, "non_gene_name") + with raises(NotImplementedError): + simp.normalize_non_gene(feat) + + +@pytest.mark.parametrize( + "in_type, name, out_type, expectation", + [ + param("mRNA", "", "mRNA", does_not_raise(), id="mRNA no change"), + param("C_gene_segment", "", "C_gene_segment", raises(GFFParserError), id="no standard name"), + param("C_gene_segment", "immunoglobulin", "IG_C_gene", does_not_raise(), id="C immunoglobulin"), + param("C_gene_segment", "ig", "IG_C_gene", does_not_raise(), id="C ig"), + param("V_gene_segment", "t-cell", "TR_V_gene", does_not_raise(), id="V t-cell"), + param("V_gene_segment", "T_cell", "TR_V_gene", does_not_raise(), id="V T_cell"), + param("V_gene_segment", "Lorem Ipsum", "", raises(GFFParserError), id="V T_cell"), + ], +) +def test_format_gene_segments( + in_type: str, + name: str, + out_type: str, + expectation: ContextManager, +) -> None: + """Test gene create gene for lone CDS.""" + simp = GFFSimplifier() + feat = SeqFeature(None, in_type) + if name: + feat.qualifiers["standard_name"] = [name] + feat.sub_features = [] + with expectation: + new_feat = simp.format_gene_segments(feat) + assert new_feat.type == out_type + + +@pytest.mark.parametrize( + "in_gff, expected_gff", + [ + param("ok_gene.gff", "ok_gene.gff", id="ok gene"), + param("clean/extra.gff", "clean/extra_clean.gff", id="ok gene with extra attribs"), + ], +) +def test_clean_gene( + data_dir: Path, + tmp_path: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, +) -> None: + """Test clean gene.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / Path(in_gff).name + check_one_feature(input_gff, output_gff, "clean_gene") + assert_files(output_gff, Path(data_dir / expected_gff)) + + +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), + param("gene_ignored.gff", None, raises(IgnoredFeatureError), id="gene ignored"), + param("gene_unsupported.gff", None, raises(UnsupportedFeatureError), id="gene unsupported"), + param("mobile_te.gff", "mobile_te.gff", does_not_raise(), id="TE"), + param("ok_protein_coding_gene.gff", "ok_gene.gff", does_not_raise(), id="ok protein_coding_gene"), + param( + "ok_tr_ignored.gff", + "ok_gene.gff", + does_not_raise(), + id="ok gene with ignored transcripts/subtranscripts", + ), + ], +) +def test_simpler_gff3_feature( + data_dir: Path, + tmp_path: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: Optional[PathLike], + expectation: ContextManager, +) -> None: + """Test simplifying one gene (from a GFF3 file).""" + input_gff = data_dir / in_gff + output_gff = tmp_path / in_gff + with expectation: + check_one_feature(input_gff, output_gff, "simpler_gff3_feature") + if expected_gff is not None: + assert_files(output_gff, Path(data_dir / expected_gff)) + + +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="ok gene"), + param("bad_gene_type.gff", "", raises(GFFParserError), id="Unsupported gene type"), + param("bad_tr_type.gff", "", raises(GFFParserError), id="Unsupported transcript type"), + param("bad_subtr_type.gff", "", raises(GFFParserError), id="Unsupported subtranscript type"), + ], +) +def test_simpler_gff3( + data_dir: Path, + tmp_path: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + expectation: ContextManager, +) -> None: + """Test simplifying genes from GFF3 files.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / Path(in_gff).name + simp = GFFSimplifier() + with expectation: + simp.simpler_gff3(input_gff) + if expected_gff: + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) + + +@pytest.mark.parametrize( + "in_gff, expected_gff, allow_cds", + [ + param("ok_gene.gff", "ok_gene.gff", False, id="ok gene"), + param("ok_gene.gff", "ok_gene.gff", True, id="ok gene, allow pseudo CDS"), + param("pseudogene.gff", "pseudogene.gff", False, id="ok pseudogene"), + param("pseudogene_cds.gff", "pseudogene_cds_removed.gff", False, id="pseudogene cds removed"), + param("pseudogene_cds.gff", "pseudogene_cds.gff", True, id="pseudogene cds kept"), + ], +) +def test_simpler_gff3_pseudogene( + data_dir: Path, + tmp_path: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + allow_cds: bool, +) -> None: + """Test simplifying pseudogenes from GFF3 files.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / Path(in_gff).name + simp = GFFSimplifier() + simp.allow_pseudogene_with_cds = allow_cds + simp.simpler_gff3(input_gff) + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) + + +@pytest.mark.parametrize( + "in_gff, expected_gff, skip_unrecognized, expectation", + [ + param("bad_gene_type.gff", "", False, raises(GFFParserError), id="Unset skip unrecognized, fail"), + param( + "bad_gene_type.gff", + "bad_gene_type_skipped.gff", + True, + does_not_raise(), + id="True skip unrecognized, no fail", + ), + param("bad_gene_type.gff", "", False, raises(GFFParserError), id="bad type, fail"), + param("ok_gene.gff", "ok_gene.gff", False, does_not_raise(), id="ok type, fail"), + ], +) +def test_simpler_gff3_skip( + data_dir: Path, + tmp_path: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + skip_unrecognized: bool, + expectation: ContextManager, +) -> None: + """Test simplifying genes from GFF3 files.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / in_gff + simp = GFFSimplifier(skip_unrecognized=skip_unrecognized) + with expectation: + simp.simpler_gff3(input_gff) + if expected_gff: + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) + + +@pytest.mark.parametrize( + "genome_file, in_gff, expected_gff", + [ + param( + None, + "genes_badnames.gff", + "genes_badnames_noname.gff", + id="Genes with bad names, no genome", + ), + param( + "genome_no_brc4.json", + "genes_badnames.gff", + "genes_badnames_noname.gff", + id="Genes with bad names, genome not BRC4", + ), + param( + "genome_brc4.json", + "genes_badnames.gff", + "genes_badnames_brc4name.gff", + id="Genes with bad names, genome BRC4", + ), + ], +) +def test_gffsimplifier_with_genome( + data_dir: Path, + tmp_path: Path, + assert_files: Callable, + genome_file: Optional[PathLike], + in_gff: PathLike, + expected_gff: PathLike, +) -> None: + """Test simplifying genes from GFF3 files.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / in_gff + if genome_file is None: + simp = GFFSimplifier() + else: + simp = GFFSimplifier(genome_path=data_dir / genome_file) + simp.simpler_gff3(input_gff) + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) + + +@pytest.mark.parametrize( + "in_gff, expected_gff, expectation", + [ + param("ok_gene.gff", "ok_gene.gff", does_not_raise(), id="normal gene"), + param( + "mirna/gene.gff", + "mirna/gene_simped.gff", + does_not_raise(), + id="gene + primary_transcript + miRNA", + ), + param( + "mirna/pseudogene.gff", + "mirna/pseudogene_simped.gff", + does_not_raise(), + id="gene + primary_transcript - miRNA", + ), + param( + "mirna/nogene.gff", + "mirna/nogene_simped.gff", + does_not_raise(), + id="primary_transcript + miRNA", + ), + param( + "mirna/pseudo_nogene.gff", + "mirna/pseudo_nogene_simped.gff", + does_not_raise(), + id="primary_transcript - miRNA", + ), + param( + "mirna/unsupported_tr.gff", + "", + raises(GFFParserError, match="Unknown subtype"), + id="gene + primary_transcript + mRNA, not supported", + ), + param( + "mirna/two_primary.gff", + "", + raises(GFFParserError, match="too many sub_features"), + id="gene + 2x primary_transcript, not supported", + ), + ], +) +def test_simpler_gff3_mirna( + data_dir: Path, + tmp_path: Path, + assert_files: Callable, + in_gff: PathLike, + expected_gff: PathLike, + expectation: ContextManager, +) -> None: + """Test normalizing miRNA genes.""" + input_gff = data_dir / in_gff + output_gff = tmp_path / Path(in_gff).name + simp = GFFSimplifier() + with expectation: + simp.simpler_gff3(input_gff) + if expected_gff: + simp.records.to_gff(output_gff) + assert_files(output_gff, data_dir / expected_gff) diff --git a/src/python/tests/gff3/test_simplifier/bad_gene_type.gff b/src/python/tests/gff3/test_simplifier/bad_gene_type.gff new file mode 100644 index 000000000..652047cef --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/bad_gene_type.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source lorem_ipsum 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/bad_gene_type_skipped.gff b/src/python/tests/gff3/test_simplifier/bad_gene_type_skipped.gff new file mode 100644 index 000000000..710a1cea7 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/bad_gene_type_skipped.gff @@ -0,0 +1,2 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 diff --git a/src/python/tests/gff3/test_simplifier/bad_subtr_type.gff b/src/python/tests/gff3/test_simplifier/bad_subtr_type.gff new file mode 100644 index 000000000..1c478ca1a --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/bad_subtr_type.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source str_lorem_ipsum 1 1000 . - . ID=LOREMIPSUM1_t1_exon1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/bad_tr_type.gff b/src/python/tests/gff3/test_simplifier/bad_tr_type.gff new file mode 100644 index 000000000..c5f39772c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/bad_tr_type.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source tr_lorem_ipsum 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/clean/extra.gff b/src/python/tests/gff3/test_simplifier/clean/extra.gff new file mode 100644 index 000000000..9d82141f8 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/clean/extra.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1;extra_field=Lorem; +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;extra_field=Lorem;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . extra_field=Lorem;ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;extra_field=Lorem;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/clean/extra_clean.gff b/src/python/tests/gff3/test_simplifier/clean/extra_clean.gff new file mode 100644 index 000000000..943179766 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/clean/extra_clean.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/gene_ignored.gff b/src/python/tests/gff3/test_simplifier/gene_ignored.gff new file mode 100644 index 000000000..f1d3e2cff --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/gene_ignored.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source region 1 1000 . - . ID=1 diff --git a/src/python/tests/gff3/test_simplifier/gene_unsupported.gff b/src/python/tests/gff3/test_simplifier/gene_unsupported.gff new file mode 100644 index 000000000..5ba5d05f3 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/gene_unsupported.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source loremipsum 1 1000 . - . ID=1 diff --git a/src/python/tests/gff3/test_simplifier/genes_badnames.gff b/src/python/tests/gff3/test_simplifier/genes_badnames.gff new file mode 100644 index 000000000..6c717d3a9 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genes_badnames.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=L1 +scaffold1 Source mRNA 1 1000 . - . ID=L1t;Parent=L1 +scaffold1 Source exon 1 1000 . - . ID=L1t1;Parent=L1t +scaffold1 Source CDS 100 900 . - 0 ID=L1tc;Parent=L1t diff --git a/src/python/tests/gff3/test_simplifier/genes_badnames_brc4name.gff b/src/python/tests/gff3/test_simplifier/genes_badnames_brc4name.gff new file mode 100644 index 000000000..1f4a37b05 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genes_badnames_brc4name.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=TMP_pfal3D7_1 +scaffold1 Source mRNA 1 1000 . - . ID=TMP_pfal3D7_1_t1;Parent=TMP_pfal3D7_1 +scaffold1 Source exon 1 1000 . - . ID=TMP_pfal3D7_1_t1-E1;Parent=TMP_pfal3D7_1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=TMP_pfal3D7_1_t1_cds;Parent=TMP_pfal3D7_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/genes_badnames_noname.gff b/src/python/tests/gff3/test_simplifier/genes_badnames_noname.gff new file mode 100644 index 000000000..655fb9b15 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genes_badnames_noname.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=TMP_PREFIX_1 +scaffold1 Source mRNA 1 1000 . - . ID=TMP_PREFIX_1_t1;Parent=TMP_PREFIX_1 +scaffold1 Source exon 1 1000 . - . ID=TMP_PREFIX_1_t1-E1;Parent=TMP_PREFIX_1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=TMP_PREFIX_1_t1_cds;Parent=TMP_PREFIX_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/genome_brc4.json b/src/python/tests/gff3/test_simplifier/genome_brc4.json new file mode 100644 index 000000000..77d2f895d --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genome_brc4.json @@ -0,0 +1,20 @@ +{ + "BRC4": { + "component": "PlasmoDB", + "organism_abbrev": "pfal3D7" + }, + "assembly": { + "accession": "GCA_000002765.1", + "provider_name": "RefSeq", + "provider_url": "https://www.ncbi.nlm.nih.gov/refseq", + "version": 1 + }, + "genebuild": { + "start_date": "2023-10-17", + "version": "2023-10-17" + }, + "species": { + "scientific_name": "Plasmodium falciparum", + "taxonomy_id": 36329 + } +} \ No newline at end of file diff --git a/src/python/tests/gff3/test_simplifier/genome_no_brc4.json b/src/python/tests/gff3/test_simplifier/genome_no_brc4.json new file mode 100644 index 000000000..1cf896c30 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/genome_no_brc4.json @@ -0,0 +1,16 @@ +{ + "assembly": { + "accession": "GCA_000002765.1", + "provider_name": "RefSeq", + "provider_url": "https://www.ncbi.nlm.nih.gov/refseq", + "version": 1 + }, + "genebuild": { + "start_date": "2023-10-17", + "version": "2023-10-17" + }, + "species": { + "scientific_name": "Plasmodium falciparum", + "taxonomy_id": 36329 + } +} \ No newline at end of file diff --git a/src/python/tests/gff3/test_simplifier/lone/cds.gff b/src/python/tests/gff3/test_simplifier/lone/cds.gff new file mode 100644 index 000000000..385f01ecb --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/cds.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone/cds_pseudo.gff b/src/python/tests/gff3/test_simplifier/lone/cds_pseudo.gff new file mode 100644 index 000000000..c87e997c8 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/cds_pseudo.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1;pseudo=true diff --git a/src/python/tests/gff3/test_simplifier/lone/cds_pseudo_simped.gff b/src/python/tests/gff3/test_simplifier/lone/cds_pseudo_simped.gff new file mode 100644 index 000000000..1d4d03466 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/cds_pseudo_simped.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source pseudogene 1 1000 . - . ID=TMP_PREFIX_1 +scaffold1 Source mRNA 1 1000 . - . ID=TMP_PREFIX_1_t1;Parent=TMP_PREFIX_1 +scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1;Parent=TMP_PREFIX_1_t1 +scaffold1 Source exon 1 1000 . - . Parent=TMP_PREFIX_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/lone/cds_simped.gff b/src/python/tests/gff3/test_simplifier/lone/cds_simped.gff new file mode 100644 index 000000000..30514b4af --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/cds_simped.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=TMP_PREFIX_1 +scaffold1 Source mRNA 1 1000 . - . ID=TMP_PREFIX_1_t1;Parent=TMP_PREFIX_1 +scaffold1 Source CDS 1 1000 . - 0 ID=LOREMIPSUM1;Parent=TMP_PREFIX_1_t1 +scaffold1 Source exon 1 1000 . - . Parent=TMP_PREFIX_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/lone/rrna.gff b/src/python/tests/gff3/test_simplifier/lone/rrna.gff new file mode 100644 index 000000000..d0f856236 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/rrna.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source rRNA 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone/rrna_simped.gff b/src/python/tests/gff3/test_simplifier/lone/rrna_simped.gff new file mode 100644 index 000000000..1179c2235 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/rrna_simped.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source rRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone/transcript.gff b/src/python/tests/gff3/test_simplifier/lone/transcript.gff new file mode 100644 index 000000000..878661029 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/transcript.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transcript 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone/transcript_simped.gff b/src/python/tests/gff3/test_simplifier/lone/transcript_simped.gff new file mode 100644 index 000000000..56c6348e7 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/transcript_simped.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source ncRNA_gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source transcript 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone/trna.gff b/src/python/tests/gff3/test_simplifier/lone/trna.gff new file mode 100644 index 000000000..72f74fb1c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/trna.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source tRNA 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/lone/trna_simped.gff b/src/python/tests/gff3/test_simplifier/lone/trna_simped.gff new file mode 100644 index 000000000..0ed4ebca4 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/lone/trna_simped.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source tRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/gene.gff b/src/python/tests/gff3/test_simplifier/mirna/gene.gff new file mode 100644 index 000000000..a447b2c7f --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/gene.gff @@ -0,0 +1,7 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1p;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1p_t1-E1M;Parent=LOREMIPSUM1p +scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1p_t1;Parent=LOREMIPSUM1p +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1p_t1-E1;Parent=LOREMIPSUM1p_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/gene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/gene_simped.gff new file mode 100644 index 000000000..b45083490 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/gene_simped.gff @@ -0,0 +1,8 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_1 +scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1_1_t1;Parent=LOREMIPSUM1_1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_1_t1-E1;Parent=LOREMIPSUM1_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/nogene.gff b/src/python/tests/gff3/test_simplifier/mirna/nogene.gff new file mode 100644 index 000000000..b4bb721c6 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/nogene.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1M;Parent=LOREMIPSUM1 +scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/nogene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/nogene_simped.gff new file mode 100644 index 000000000..487283c45 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/nogene_simped.gff @@ -0,0 +1,8 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_0 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_0_t1;Parent=LOREMIPSUM1_0 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_0_t1-E1;Parent=LOREMIPSUM1_0_t1 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_1 +scaffold1 Source miRNA 1 1000 . - . ID=LOREMIPSUM1_1_t1;Parent=LOREMIPSUM1_1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_1_t1-E1;Parent=LOREMIPSUM1_1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene.gff b/src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene.gff new file mode 100644 index 000000000..5a4f0c998 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1M;Parent=LOREMIPSUM1 \ No newline at end of file diff --git a/src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene_simped.gff new file mode 100644 index 000000000..502aa673c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/pseudo_nogene_simped.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1_0 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_0_t1;Parent=LOREMIPSUM1_0 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_0_t1-E1;Parent=LOREMIPSUM1_0_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/pseudogene.gff b/src/python/tests/gff3/test_simplifier/mirna/pseudogene.gff new file mode 100644 index 000000000..76e01bb16 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/pseudogene.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1p;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1p_t1-E1M;Parent=LOREMIPSUM1p diff --git a/src/python/tests/gff3/test_simplifier/mirna/pseudogene_simped.gff b/src/python/tests/gff3/test_simplifier/mirna/pseudogene_simped.gff new file mode 100644 index 000000000..a552d5ca6 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/pseudogene_simped.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/two_primary.gff b/src/python/tests/gff3/test_simplifier/mirna/two_primary.gff new file mode 100644 index 000000000..63dd90143 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/two_primary.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1_t2;Parent=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/mirna/unsupported_tr.gff b/src/python/tests/gff3/test_simplifier/mirna/unsupported_tr.gff new file mode 100644 index 000000000..684525919 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mirna/unsupported_tr.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source primary_transcript 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1M;Parent=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/mobile_te.gff b/src/python/tests/gff3/test_simplifier/mobile_te.gff new file mode 100644 index 000000000..6b9937360 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/mobile_te.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transposable_element 1 1000 . - . ID=LOREM_IPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name.gff b/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name.gff new file mode 100644 index 000000000..495ad3731 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source mobile_genetic_element 1 1000 . - . ID=LOREMIPSUM1;gbkey=mobile_element;mobile_element_type=transposon diff --git a/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name_simped.gff3 b/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name_simped.gff3 new file mode 100644 index 000000000..128627f25 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/mobile_1name_simped.gff3 @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transposable_element 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/non_gene/mobile_full.gff b/src/python/tests/gff3/test_simplifier/non_gene/mobile_full.gff new file mode 100644 index 000000000..65952b0a0 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/mobile_full.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source mobile_genetic_element 1 1000 . - . ID=LOREMIPSUM1;gbkey=mobile_element;mobile_element_type=transposon:rnd-1_family-319/TvMULE1 diff --git a/src/python/tests/gff3/test_simplifier/non_gene/mobile_full_simped.gff b/src/python/tests/gff3/test_simplifier/non_gene/mobile_full_simped.gff new file mode 100644 index 000000000..128627f25 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/mobile_full_simped.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transposable_element 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/non_gene/te.gff b/src/python/tests/gff3/test_simplifier/non_gene/te.gff new file mode 100644 index 000000000..128627f25 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/non_gene/te.gff @@ -0,0 +1,3 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source transposable_element 1 1000 . - . ID=LOREMIPSUM1 diff --git a/src/python/tests/gff3/test_simplifier/ok_gene.gff b/src/python/tests/gff3/test_simplifier/ok_gene.gff new file mode 100644 index 000000000..943179766 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/ok_gene.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/ok_protein_coding_gene.gff b/src/python/tests/gff3/test_simplifier/ok_protein_coding_gene.gff new file mode 100644 index 000000000..6c1b6d2c3 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/ok_protein_coding_gene.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source protein_coding_gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1_exon1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/ok_tr_ignored.gff b/src/python/tests/gff3/test_simplifier/ok_tr_ignored.gff new file mode 100644 index 000000000..916081424 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/ok_tr_ignored.gff @@ -0,0 +1,8 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source gene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source five_prime_UTR 900 1000 . - 0 ID=5pID;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 +scaffold1 Source three_prime_UTR 900 1000 . - 0 ID=3pID;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/pseudogene.gff b/src/python/tests/gff3/test_simplifier/pseudogene.gff new file mode 100644 index 000000000..a6d419cb0 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/pseudogene.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source pseudogene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/pseudogene_cds.gff b/src/python/tests/gff3/test_simplifier/pseudogene_cds.gff new file mode 100644 index 000000000..48d5ba95c --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/pseudogene_cds.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source pseudogene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1 +scaffold1 Source CDS 100 900 . - 0 ID=LOREMIPSUM1_t1_cds;Parent=LOREMIPSUM1_t1 diff --git a/src/python/tests/gff3/test_simplifier/pseudogene_cds_removed.gff b/src/python/tests/gff3/test_simplifier/pseudogene_cds_removed.gff new file mode 100644 index 000000000..a6d419cb0 --- /dev/null +++ b/src/python/tests/gff3/test_simplifier/pseudogene_cds_removed.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region scaffold1 1 1000 +scaffold1 Source pseudogene 1 1000 . - . ID=LOREMIPSUM1 +scaffold1 Source mRNA 1 1000 . - . ID=LOREMIPSUM1_t1;Parent=LOREMIPSUM1 +scaffold1 Source exon 1 1000 . - . ID=LOREMIPSUM1_t1-E1;Parent=LOREMIPSUM1_t1