From 0aac6300691f775ea38f5da1b0db33cdf2f6a594 Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Thu, 12 Dec 2024 18:01:46 -0800 Subject: [PATCH 1/2] For geneontology/pipeline#408 --- ontobio/io/assocparser.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ontobio/io/assocparser.py b/ontobio/io/assocparser.py index ccb1bed8..f4b9f369 100644 --- a/ontobio/io/assocparser.py +++ b/ontobio/io/assocparser.py @@ -800,9 +800,11 @@ def _validate_id(self, id, line: SplitLine, allowed_ids=None, context=None): return False (id_prefix, right) = id.split(":", maxsplit=1) + mgi_id = None if right.startswith("MGI:"): ## See ticket https://github.com/geneontology/go-site/issues/91 ## For purposes of determining allowed IDs in DB XREF, MGI IDs shall look like `MGI:12345` + mgi_id = right right = right[4:] if id_prefix == "" or right == "": @@ -830,9 +832,16 @@ def _validate_id(self, id, line: SplitLine, allowed_ids=None, context=None): if regex.fullmatch(right): identity_matches_pattern = True break - if identity_matches_pattern == False: + # check syntax for mgi using id instead of internal representation + if mgi_id is not None and regex.fullmatch(mgi_id): + identity_matches_pattern = True + break + if identity_matches_pattern == False and mgi_id is None: self.report.warning(line.line, Report.INVALID_ID, id, "GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(right, id_prefix), taxon=line.taxon, rule=27) + elif identity_matches_pattern == False and mgi_id is not None: + self.report.warning(line.line, Report.INVALID_ID, id, + "GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(mgi_id, id_prefix), taxon=line.taxon, rule=27) else: self.report.warning(line.line, Report.INVALID_ID, id, "GORULE:0000027: {} not found in list of database names in dbxrefs".format(id_prefix), taxon=line.taxon, rule=27) From 69e0b3704a4425055b72a55a32142d75bd50caa1 Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Fri, 13 Dec 2024 14:07:07 -0800 Subject: [PATCH 2/2] For geneontology/pipeline#408 --- tests/test_gafparser.py | 23 +++++++++++++++-- tests/test_gpad_parser.py | 52 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/tests/test_gafparser.py b/tests/test_gafparser.py index a9dc907d..02f6c059 100644 --- a/tests/test_gafparser.py +++ b/tests/test_gafparser.py @@ -633,7 +633,11 @@ def test_id_syntax(): database_id_syntax_lookups['PomBase'] = pombase_types wb_ref_types = {} - database_id_syntax_lookups['WB_REF'] = wb_ref_types + database_id_syntax_lookups['WB_REF'] = wb_ref_types + + mgi_types = {} + mgi_types['entity'] = re.compile('MGI:[0-9]{5,}') + database_id_syntax_lookups['MGI'] = mgi_types p = GafParser(config=assocparser.AssocParserConfig( ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)) @@ -648,7 +652,13 @@ def test_id_syntax(): assert len(assoc_result.associations) == 1 assert assoc_result.skipped == False messages = p.report.to_report_json()["messages"] - assert "gorule-0000027" not in messages + assert "gorule-0000027" not in messages + + assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tWB_REF:WBPaper00006408|PMID:18422602\tIPI\tMGI:MGI:1298204\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t") + assert len(assoc_result.associations) == 1 + assert assoc_result.skipped == False + messages = p.report.to_report_json()["messages"] + assert "gorule-0000027" not in messages p = GafParser(config=assocparser.AssocParserConfig( ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)) @@ -686,6 +696,15 @@ def test_id_syntax(): assert len(messages["gorule-0000027"]) == 1 assert messages["gorule-0000027"][0]["obj"] == "BLA:18422602" + p = GafParser(config=assocparser.AssocParserConfig( + ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)) + assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tWB_REF:WBPaper00006408|PMID:18422602\tIPI\tMGI:1298204\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t") + assert len(assoc_result.associations) == 1 + assert assoc_result.skipped == False + messages = p.report.to_report_json()["messages"] + assert len(messages["gorule-0000027"]) == 1 + assert messages["gorule-0000027"][0]["obj"] == "MGI:1298204" + def test_gaf_gpi_bridge(): gaf = ["MGI", "MGI:1923503", "0610006L08Rik", "enables", "GO:0003674", "MGI:MGI:2156816|GO_REF:0000015", "ND", "", diff --git a/tests/test_gpad_parser.py b/tests/test_gpad_parser.py index c38a727d..d19592d5 100644 --- a/tests/test_gpad_parser.py +++ b/tests/test_gpad_parser.py @@ -345,6 +345,10 @@ def test_id_syntax(): pombase_types['entity'] = re.compile('S\\w+(\\.)?\\w+(\\.)?') database_id_syntax_lookups['PomBase'] = pombase_types + mgi_types = {} + mgi_types['entity'] = re.compile('MGI:[0-9]{5,}') + database_id_syntax_lookups['MGI'] = mgi_types + eco_types = {} eco_types['entity'] = re.compile(pattern) database_id_syntax_lookups['ECO'] = eco_types @@ -396,7 +400,31 @@ def test_id_syntax(): assert len(result.associations) == 1 assert result.skipped == False messages = p.report.to_report_json()["messages"] - assert "gorule-0000027" not in messages + assert "gorule-0000027" not in messages + + vals = ["PomBase", + "SPAC25A8.01c", + "acts_upstream_of_or_within", + "GO:0007155", + "MGI:MGI:1298204", + "ECO:0000305", + "GO:0005913", + "", + "20041026", + "ZFIN", + "", + "PomBase" + ] + + config = assocparser.AssocParserConfig( + ontology=OntologyFactory().create(ALT_ID_ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups) + p = GpadParser(config=config) + result = p.parse_line("\t".join(vals)) + assert len(result.associations) == 1 + assert result.skipped == False + messages = p.report.to_report_json()["messages"] + assert "gorule-0000027" not in messages + vals = ["PomBase", "SPAC25A8.01c", @@ -487,6 +515,28 @@ def test_id_syntax(): assert len(messages["gorule-0000027"]) == 1 assert messages["gorule-0000027"][0]["obj"] == "BLA:15494018" + vals = ["PomBase", + "SPAC25A8.01c", + "acts_upstream_of_or_within", + "GO:0007155", + "MGI:15494018", + "ECO:0000305", + "GO:0005913", + "", + "20041026", + "ZFIN", + "", + "PomBase" + ] + p = GpadParser(config=config) + result = p.parse_line("\t".join(vals)) + assert len(result.associations) == 1 + assert result.skipped == False + messages = p.report.to_report_json()["messages"] + assert len(messages["gorule-0000027"]) == 1 + assert messages["gorule-0000027"][0]["obj"] == "MGI:15494018" + + def test_gpi_check(): report = assocparser.Report(group="unknown", dataset="unknown") vals = [