diff --git a/parsers/SIGNOR/src/loadSIGNOR.py b/parsers/SIGNOR/src/loadSIGNOR.py index 446b1d1..6a04efc 100644 --- a/parsers/SIGNOR/src/loadSIGNOR.py +++ b/parsers/SIGNOR/src/loadSIGNOR.py @@ -8,13 +8,12 @@ from requests_toolbelt.multipart.encoder import MultipartEncoder -from Common.extractor import Extractor from Common.biolink_constants import * from Common.prefixes import * from Common.utils import GetData from Common.loader_interface import SourceDataLoader -from parsers.SIGNOR.src.signor_mechanism_predicate_mapping import * +from parsers.SIGNOR.src.signor_mechanism_predicate_mapping import ptm_dict, mechanism_map, effect_mapping class DATACOLS(enum.IntEnum): @@ -70,7 +69,7 @@ class SIGNORLoader(SourceDataLoader): license = ("SIGNOR is licensed under a Creative Commons Attribution-NonCommercial 4.0 International " "(CC BY-NC 4.0) license.") attribution = 'https://signor.uniroma2.it/about/' - parsing_version = '1.3' + parsing_version = '1.4' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -92,10 +91,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.signor_pathways_filename = "SIGNOR-pathways.tsv" self.signor_mechanisms_filename = "SIGNOR-mechanisms.csv" self.signor_modifications_filename = "SIGNOR-modifications.csv" - self.signor_data_file = "signor_data.json" - - self.signor_version = self.get_latest_source_version() - self.signor_file_name = "getLatestRelease.php" + self.signor_file_name = "signor_latest.tsv" self.data_files = [self.signor_file_name, self.signor_phenotypes_filename, self.signor_stimuli_filename, @@ -103,8 +99,11 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.signor_proteinfamily_filename, self.signor_pathways_filename, self.signor_mechanisms_filename, - self.signor_modifications_filename - ] + self.signor_modifications_filename] + + # this is not a source file but a mapping we create, we write it to file for later perusal + self.signor_type_map_file = f"signor_data_{self.parsing_version}.json" + self.signor_type_map = None def get_latest_source_version(self) -> str: """ @@ -126,71 +125,64 @@ def get_data(self) -> int: which is why MultipartEncoder is used. """ + # self.signor_phenotypes_filename + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download phenotype data")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_phenotypes_filename), 'wb') as f: + f.write(response.content) + + # self.signor_stimuli_filename + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download stimulus data")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_stimuli_filename), 'wb') as f: + f.write(response.content) + + # self.signor_complex_filename + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download complex data")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_complex_filename), 'wb') as f: + f.write(response.content) + + # self.signor_proteinfamily_filename: + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download protein family data")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_proteinfamily_filename), 'wb') as f: + f.write(response.content) + + # self.signor_mechanisms_filename: + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download Mechansims CV")}) + # Mechanism is misspelled on the SIGNOR website. If they fix their spelling, this will break + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_cv_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_mechanisms_filename), 'wb') as f: + f.write(response.content) + + # self.signor_modifications_filename: + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download Modifications CV")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_cv_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_modifications_filename), 'wb') as f: + f.write(response.content) + + # self.signor_pathways_filename: + mp_encoder = MultipartEncoder(fields={'format': 'include SIGNOR entities', + 'submit': 'Download GMT File (all Pathways)' + }) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_pathways_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_pathways_filename), 'wb') as f: + f.write(response.content) + + # self.signor_file_name: data_puller = GetData() - file_count = 0 - for source in self.data_files: - if source == self.signor_phenotypes_filename: - mp_encoder = MultipartEncoder(fields={"submit": (None, "Download phenotype data")}) - headers = {'Content-Type': mp_encoder.content_type} - response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) - with open(os.path.join(self.data_path, self.signor_phenotypes_filename), 'wb') as f: - f.write(response.content) - - elif source == self.signor_stimuli_filename: - mp_encoder = MultipartEncoder(fields={"submit": (None, "Download stimulus data")}) - headers = {'Content-Type': mp_encoder.content_type} - response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) - with open(os.path.join(self.data_path, self.signor_stimuli_filename), 'wb') as f: - f.write(response.content) - - elif source == self.signor_complex_filename: - mp_encoder = MultipartEncoder(fields={"submit": (None, "Download complex data")}) - headers = {'Content-Type': mp_encoder.content_type} - response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) - with open(os.path.join(self.data_path, self.signor_complex_filename), 'wb') as f: - f.write(response.content) - - elif source == self.signor_proteinfamily_filename: - mp_encoder = MultipartEncoder(fields={"submit": (None, "Download protein family data")}) - headers = {'Content-Type': mp_encoder.content_type} - response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) - with open(os.path.join(self.data_path, self.signor_proteinfamily_filename), 'wb') as f: - f.write(response.content) - - elif source == self.signor_mechanisms_filename: - mp_encoder = MultipartEncoder(fields={"submit": (None, "Download Mechansims CV")}) - # Mechanism is misspelled on the SIGNOR website. If they fix their spelling, this will break - headers = {'Content-Type': mp_encoder.content_type} - response = rq.post(self.signor_cv_download, headers=headers, data=mp_encoder) - with open(os.path.join(self.data_path, self.signor_mechanisms_filename), 'wb') as f: - f.write(response.content) - - elif source == self.signor_modifications_filename: - mp_encoder = MultipartEncoder(fields={"submit": (None, "Download Modifications CV")}) - headers = {'Content-Type': mp_encoder.content_type} - response = rq.post(self.signor_cv_download, headers=headers, data=mp_encoder) - with open(os.path.join(self.data_path, self.signor_modifications_filename), 'wb') as f: - f.write(response.content) - - elif source == self.signor_pathways_filename: - mp_encoder = MultipartEncoder(fields={'format': 'include SIGNOR entities', - 'submit': 'Download GMT File (all Pathways)' - }) - headers = {'Content-Type': mp_encoder.content_type} - response = rq.post(self.signor_pathways_download, headers=headers, data=mp_encoder) - with open(os.path.join(self.data_path, self.signor_pathways_filename), 'wb') as f: - f.write(response.content) - - elif source == self.signor_file_name: - data_puller.pull_via_http(self.signor_data_url, self.data_path) - - file_count += 1 - - self.make_datafile() - - return file_count - - def make_datafile(self): + data_puller.pull_via_http(self.signor_data_url, self.data_path, saved_file_name=self.signor_file_name) + return len(self.data_files) + + def make_signor_type_map(self): """ This function makes the data file which is a json file with all the SIGNOR data types laid out. This file can then be used later to make SIGNOR type Entities to their respective information. @@ -283,57 +275,42 @@ def make_datafile(self): # modifications and mechanisms to known CURIES: ie HP or REACTOME. For now, its usuable. # Write the list of dictionaries to a JSON file for reference during edge creation - with open(os.path.join(self.data_path, self.signor_data_file), mode='w') as outfile: + with open(os.path.join(self.data_path, self.signor_type_map_file), mode='w') as outfile: json.dump(signordata, outfile, indent=4) + return signordata - def node_data_mapping(self, line): - - def get_node(database, identifier): - """adds the correct prefixes to the subject and objects""" - database_prefix_map = { - "UNIPROT": UNIPROTKB, - "PUBCHEM": PUBCHEM_COMPOUND, - "RNAcentral": RNACENTRAL, - "DRUGBANK": DRUGBANK - } - - if database == "PUBCHEM": - # Remove prefix from PUBCHEM IDs in SIGNOR - return f"{database_prefix_map.get(database)}:{identifier.replace('CID:', '')}" - - if database == "UNIPROT" and len(identifier.split("-PRO_")) > 1: - # Remove suffix from UNIPROT IDs in SIGNOR - # These suffixes are specific regions of the gene/protein and will be added to X_part_qualifier - return f"{database_prefix_map.get(database)}:{identifier.split('-PRO_')[0]}" - - node = f"{database_prefix_map.get(database)}:{identifier}" if database in database_prefix_map else None - return node - - def signor_node_mapping(type, identifier): - """maps the SIGNOR ID to the GO_TERM if the available.""" - with open(os.path.join(self.data_path, self.signor_data_file), 'r') as file: - data = json.load(file) - - # Search for the entry with the specified SIGNOR ID - for entry in data.get(type, []): - if entry.get("SIGNOR ID") == identifier: - go_term = entry.get("GO_TERM") - if go_term: - return go_term # Return the GO_TERM if found + @staticmethod + def fix_node_curie_prefix(database, identifier): + """adds the correct prefixes to the subject and objects""" + database_prefix_map = { + "UNIPROT": UNIPROTKB, + "PUBCHEM": PUBCHEM_COMPOUND, + "RNAcentral": RNACENTRAL, + "DRUGBANK": DRUGBANK + } - # Mapping for subject and object - subject_node = get_node(line[DATACOLS.SUBJECT_DATABASE.value], line[DATACOLS.SUBJECT_ID.value]) - object_node = get_node(line[DATACOLS.OBJECT_DATABASE.value], line[DATACOLS.OBJECT_ID.value]) + if database == "PUBCHEM": + # Remove prefix from PUBCHEM IDs in SIGNOR + return f"{database_prefix_map.get(database)}:{identifier.replace('CID:', '')}" - # - if line[DATACOLS.SUBJECT_DATABASE.value] == "SIGNOR": - subject_node = signor_node_mapping(line[DATACOLS.SUBJECT_TYPE.value], line[DATACOLS.SUBJECT_ID.value]) + if database == "UNIPROT" and len(identifier.split("-PRO_")) > 1: + # Remove suffix from UNIPROT IDs in SIGNOR + # These suffixes are specific regions of the gene/protein and will be added to X_part_qualifier + return f"{database_prefix_map.get(database)}:{identifier.split('-PRO_')[0]}" - if line[DATACOLS.OBJECT_DATABASE.value] == "SIGNOR": - object_node = signor_node_mapping(line[DATACOLS.OBJECT_TYPE.value], line[DATACOLS.OBJECT_ID.value]) + node = f"{database_prefix_map.get(database)}:{identifier}" if database in database_prefix_map else None + return node - return subject_node, object_node + def signor_node_mapping(self, node_type, identifier): + """maps the SIGNOR ID to the GO_TERM if the available.""" + # Search for the entry with the specified SIGNOR ID + for entry in self.signor_type_map.get(node_type, []): + if entry.get("SIGNOR ID") == identifier: + go_term = entry.get("GO_TERM") + if go_term: + return go_term # Return the GO_TERM if found + return None @staticmethod def get_anatomical_context(line): @@ -351,7 +328,7 @@ def get_taxon(line): returns None if taxon id is invalid """ taxon_value = line[DATACOLS.TAXON.value] - return [f"NCBITaxon:{taxon_value}"] if taxon_value not in ["", "-1"] else None + return f"NCBITaxon:{taxon_value}" if taxon_value not in ["", "-1"] else None @staticmethod def get_part_qualifier(line): @@ -360,15 +337,14 @@ def get_part_qualifier(line): """ def get_part(database, identifier): if database == "UNIPROT" and len(identifier.split("-PRO_")) > 1: - return [identifier.split('-')[1]] + return identifier.split('-')[1] subject_part_qualifier = get_part(line[DATACOLS.SUBJECT_DATABASE.value], line[DATACOLS.SUBJECT_ID.value]) object_part_qualifier = get_part(line[DATACOLS.OBJECT_DATABASE.value], line[DATACOLS.OBJECT_ID.value]) - return subject_part_qualifier, object_part_qualifier @staticmethod - def edge_properties_from_mechanism(line, effect, predicate): + def edge_predicate_from_mechanism_effect(line, effect): """ get the edge properties from the SIGNOR mechanisms/effects """ @@ -385,19 +361,22 @@ def edge_properties_from_mechanism(line, effect, predicate): QUALIFIED_PREDICATE: "RO:0003303", # causes OBJECT_DIRECTION_QUALIFIER: direction_qualifier, OBJECT_ASPECT_QUALIFIER: effect, - OBJECT_PART_QUALIFIER: [line[DATACOLS.AA_MODIFIED.value]] if line[DATACOLS.AA_MODIFIED.value] else None + OBJECT_PART_QUALIFIER: line[DATACOLS.AA_MODIFIED.value] if line[DATACOLS.AA_MODIFIED.value] else None } # other mechanisms - predicate = mechanism_map.get(effect, {}).get("predicate", predicate) - edge_properties = mechanism_map.get(effect, {}).get("edge_properties", {}) - - return predicate, edge_properties + predicate = mechanism_map.get(effect, {}).get("predicate", None) + edge_qualifiers = mechanism_map.get(effect, {}).get("edge_properties", None) + return predicate, edge_qualifiers def get_basic_edge_properties(self, line): """ define basic edge properties for all edges """ + + # we may need to split the edge into multiple edges, if so append the different sets of properties to this list + split_edge_properties = [] + edge_properties = { PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, @@ -405,44 +384,34 @@ def get_basic_edge_properties(self, line): PUBLICATIONS: ['PMID:' + line[DATACOLS.PUBLICATIONS.value]], DESCRIPTION: [line[DATACOLS.DESCRIPTION.value]], SPECIES_CONTEXT_QUALIFIER: self.get_taxon(line), - ANATOMICAL_CONTEXT_QUALIFIER: self.get_anatomical_context(line), SUBJECT_PART_QUALIFIER: self.get_part_qualifier(line)[0] if self.get_part_qualifier(line)[0] else None, OBJECT_PART_QUALIFIER: self.get_part_qualifier(line)[1] if self.get_part_qualifier(line)[1] else None } - return edge_properties - - def create_and_parse_edge(self, row, extractor, predicate="biolink:related_to", - edge_properties=None, mechanism=None): - """ - Creates predicates and edge properties for a row - based on the effects and mechanisms in SIGNOR - """ - - # Default Edge Properties - basic_edge_properties = self.get_basic_edge_properties(row) - if mechanism: - predicate, mechanism_edge_properties = self.edge_properties_from_mechanism(row, mechanism, predicate) - # Add mechanism specific edge properties to the basic edge properties - edge_properties = basic_edge_properties | mechanism_edge_properties - - if edge_properties: - # Add basic edge properties to effect specific edge properties - edge_properties.update(basic_edge_properties) + # remove empty or null keys (avoiding qualifier=None) + for key in list(edge_properties.keys()): + if not edge_properties[key]: + del(edge_properties[key]) + + anatomical_contexts = self.get_anatomical_context(line) + if anatomical_contexts: + # anatomical_contexts is a list that may be > 1, make a new set of edge properties for each entry + for anatomical_context in anatomical_contexts: + new_edge_properties = edge_properties.copy() + new_edge_properties[ANATOMICAL_CONTEXT_QUALIFIER] = anatomical_context + split_edge_properties.append(new_edge_properties) else: - edge_properties = basic_edge_properties + split_edge_properties = [edge_properties] + return split_edge_properties + + def get_converted_node_id(self, node_id, node_type, node_database): - extractor.parse_row( - row, - subject_extractor=lambda line: self.node_data_mapping(line)[0], - object_extractor=lambda line: self.node_data_mapping(line)[1], - predicate_extractor=lambda line: predicate, - subject_property_extractor=None, - object_property_extractor=None, - edge_property_extractor=lambda line: edge_properties - ) + # if it's from SIGNOR, look up the proper ID using our mappings + if node_database == "SIGNOR": + return self.signor_node_mapping(node_type, node_id) - return predicate, edge_properties + # otherwise make sure the curie prefix is correct and return the curie + return self.fix_node_curie_prefix(node_database, node_id) def parse_data(self) -> dict: """ @@ -450,38 +419,66 @@ def parse_data(self) -> dict: :return: ret_val: load_metadata """ - extractor = Extractor(file_writer=self.output_file_writer) - + self.signor_type_map = self.make_signor_type_map() + input_rows = 0 + skipped_rows = 0 with open(os.path.join(self.data_path, self.signor_file_name)) as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='"') next(reader) for row in reader: + input_rows += 1 + subject_id = self.get_converted_node_id(node_id=row[DATACOLS.SUBJECT_ID.value], + node_type=row[DATACOLS.SUBJECT_TYPE.value], + node_database=row[DATACOLS.SUBJECT_DATABASE.value]) + object_id = self.get_converted_node_id(node_id=row[DATACOLS.OBJECT_ID.value], + node_type=row[DATACOLS.OBJECT_TYPE.value], + node_database=row[DATACOLS.OBJECT_DATABASE.value]) + if not (subject_id and object_id): + skipped_rows += 1 + continue + + self.output_file_writer.write_node(subject_id) + self.output_file_writer.write_node(object_id) effect = row[DATACOLS.EFFECT.value] mechanism = row[DATACOLS.MECHANISM.value] - if effect in effect_mapping.keys(): - # Handle edge from mechanism - if mechanism: - self.create_and_parse_edge(row, extractor, mechanism=mechanism) - - for predicate in effect_mapping[effect].keys(): - edge_properties = effect_mapping[effect][predicate] - - # Final edge creation - if mechanism: - # Handle edge from mechanism - self.create_and_parse_edge(row, extractor, predicate=predicate, - edge_properties=edge_properties, mechanism=mechanism) - else: - self.create_and_parse_edge(row, extractor, predicate=predicate, - edge_properties=edge_properties) - # Handle unknown effect case - elif effect == "unknown" and mechanism: - self.create_and_parse_edge(row, extractor, mechanism=mechanism) - - else: - self.create_and_parse_edge(row, extractor) - - return extractor.load_metadata + if mechanism: + mechanism_predicate, mechanism_edge_qualifiers = \ + self.edge_predicate_from_mechanism_effect(row, effect=effect) + + # basic_edge_properties is actually a list of property dictionaries, + # because we will split edges that have multiple qualifiers of the same type + basic_edge_properties = self.get_basic_edge_properties(row) + for edge_properties in basic_edge_properties: + + # if there are mechanism qualifiers always add them + if mechanism_edge_qualifiers: + edge_properties |= mechanism_edge_qualifiers + + # if there is a mechanism predicate make an edge with it + if mechanism_predicate: + self.output_file_writer.write_edge(subject_id=subject_id, + predicate=mechanism_predicate, + object_id=object_id, + edge_properties=edge_properties) + + # make edges for any predicates mapped in effect_mapping, + # with the qualifiers in effect_mapping plus the mechanism qualifiers + if effect in effect_mapping: + for predicate, qualifiers in effect_mapping[effect].items(): + edge_properties |= qualifiers + self.output_file_writer.write_edge(subject_id=subject_id, + predicate=predicate, + object_id=object_id, + edge_properties=edge_properties) + else: + # neither effect or effect/mechanism mapped to a predicate + self.output_file_writer.write_edge(subject_id=subject_id, + predicate="biolink:related_to", + object_id=object_id, + edge_properties=edge_properties) + + return {'num_source_lines': input_rows, + 'unusable_source_lines': skipped_rows} diff --git a/parsers/SIGNOR/src/signor_mechanism_predicate_mapping.py b/parsers/SIGNOR/src/signor_mechanism_predicate_mapping.py index dec69e1..fe0944b 100644 --- a/parsers/SIGNOR/src/signor_mechanism_predicate_mapping.py +++ b/parsers/SIGNOR/src/signor_mechanism_predicate_mapping.py @@ -71,9 +71,9 @@ # This probably needs to be a node property. "guanine nucleotide exchange factor": { - "edge_properties": { - CAUSAL_MECHANISM_QUALIFIER: "guanyl_nucleotide_exchange" - } + "edge_properties": { + CAUSAL_MECHANISM_QUALIFIER: "guanyl_nucleotide_exchange" + } }, "post transcriptional modification": {