From 4e599a27dae9ded97d9db8552a9b20b06ee34dee Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Tue, 21 Jan 2025 02:47:31 -0500 Subject: [PATCH] adding OHD-Carolina --- Common/biolink_constants.py | 6 ++ Common/data_sources.py | 2 + graph_specs/cohd-graph-spec.yaml | 10 --- parsers/cohd/src/loadCOHD.py | 1 - parsers/ohd_carolina/src/loadOHD.py | 119 ++++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 11 deletions(-) delete mode 100644 graph_specs/cohd-graph-spec.yaml create mode 100644 parsers/ohd_carolina/src/loadOHD.py diff --git a/Common/biolink_constants.py b/Common/biolink_constants.py index 15bd0a32..343c7ba2 100644 --- a/Common/biolink_constants.py +++ b/Common/biolink_constants.py @@ -55,6 +55,9 @@ KNOWLEDGE_LEVEL = 'knowledge_level' MAX_RESEARCH_PHASE = 'max_research_phase' HAS_SUPPORTING_STUDY_RESULT = 'has_supporting_study_result' +LOG_ODDS_RATIO = 'log_odds_ratio' +LOG_ODDS_RATIO_95_CI = 'log_odds_ratio_95_ci' +TOTAL_SAMPLE_SIZE = 'total_sample_size' # enums for knowledge level KNOWLEDGE_ASSERTION = 'knowledge_assertion' @@ -153,6 +156,9 @@ MECHANISM_OF_ACTION, MAX_RESEARCH_PHASE, HAS_SUPPORTING_STUDY_RESULT, + LOG_ODDS_RATIO, + LOG_ODDS_RATIO_95_CI, + TOTAL_SAMPLE_SIZE, # qualifiers ANATOMICAL_CONTEXT_QUALIFIER, CAUSAL_MECHANISM_QUALIFIER, diff --git a/Common/data_sources.py b/Common/data_sources.py index c504c44e..7fd0b26e 100644 --- a/Common/data_sources.py +++ b/Common/data_sources.py @@ -30,6 +30,7 @@ MOLEPRO = 'MolePro' MONARCH_KG = 'MonarchKG' MONDO_PROPS = 'MONDOProps' +OHD_CAROLINA = 'OHD-Carolina' ONTOLOGICAL_HIERARCHY = 'OntologicalHierarchy' PANTHER = 'PANTHER' PHAROS = 'PHAROS' @@ -82,6 +83,7 @@ MOLEPRO: ("parsers.molepro.src.loadMolePro", "MoleProLoader"), MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"), MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"), + OHD_CAROLINA: ("parsers.ohd_carolina.src.loadOHD", "OHDLoader"), ONTOLOGICAL_HIERARCHY: ("parsers.UberGraph.src.loadUG", "OHLoader"), 
PANTHER: ("parsers.panther.src.loadPanther", "PLoader"), PHAROS: ("parsers.PHAROS.src.loadPHAROS", "PHAROSLoader"), diff --git a/graph_specs/cohd-graph-spec.yaml b/graph_specs/cohd-graph-spec.yaml deleted file mode 100644 index 36149ff4..00000000 --- a/graph_specs/cohd-graph-spec.yaml +++ /dev/null @@ -1,10 +0,0 @@ -graphs: - - - graph_id: COHD_Automat - graph_name: COHD - graph_description: - graph_url: - conflation: False - output_format: neo4j - sources: - - source_id: COHD \ No newline at end of file diff --git a/parsers/cohd/src/loadCOHD.py b/parsers/cohd/src/loadCOHD.py index 3840e469..9c6bb04b 100644 --- a/parsers/cohd/src/loadCOHD.py +++ b/parsers/cohd/src/loadCOHD.py @@ -4,7 +4,6 @@ import yaml from Common.loader_interface import SourceDataLoader -from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE from Common.utils import GetData, quick_jsonl_file_iterator diff --git a/parsers/ohd_carolina/src/loadOHD.py b/parsers/ohd_carolina/src/loadOHD.py new file mode 100644 index 00000000..9e7acaf8 --- /dev/null +++ b/parsers/ohd_carolina/src/loadOHD.py @@ -0,0 +1,119 @@ + +import os +import requests +import yaml +import enum +import orjson + +from io import TextIOWrapper +from zipfile import ZipFile +from Common.extractor import Extractor +from Common.loader_interface import SourceDataLoader +from Common.biolink_constants import * +from Common.utils import GetData + + +class EDGESDATACOLS(enum.IntEnum): + SUBJECT_ID = 0 + SUBJECT_NAME = 1 + OBJECT_ID = 2 + OBJECT_NAME = 3 + PREDICATE = 4 + CHI_SQUARED_P_VALUE = 5 + LOG_ODDS_RATIO = 6 + LOG_ODDS_RATIO_95_CI = 7 + SCORE = 8 + TOTAL_SAMPLE_SIZE = 9 + PRIMARY_KS = 10 + + +############## +# Class: OHD source loader +# +# Desc: Class that loads/parses the Open Health Data @ Carolina data. 
##############
class OHDLoader(SourceDataLoader):
    """Source loader for the Open Health Data @ Carolina (OHD-Carolina) edges.

    The loader downloads a zip archive from ``data_url``, reads the edges CSV
    stored inside it and feeds every row through an ``Extractor`` to produce
    subject/object nodes and the edge connecting them.
    """

    source_id: str = 'OHD-Carolina'
    provenance_id: str = 'infores:openhealthdata-carolina'
    parsing_version: str = '1.0'

    # Seconds to wait for the version file before giving up; without a timeout
    # requests will block indefinitely on an unresponsive server.
    version_request_timeout: int = 30

    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
        """
        :param test_mode - sets the run into test mode
        :param source_data_dir - the specific storage directory to save files in
        """
        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

        self.data_url = 'https://stars.renci.org/var/data_services/ohd/'
        self.version_file = 'ohd.yaml'
        self.ohd_archive_file = 'unc_omop_2018_2022_kg.zip'
        self.ohd_edges_file = 'unc_omop_2018_2022_kg.csv'
        self.data_files = [self.ohd_archive_file]

    def get_latest_source_version(self) -> str:
        """
        Fetches the version file from the data server and reads its 'build' entry.

        :return: the value of the 'build' key, converted to a string
        :raises requests.HTTPError: if the server answers with an error status
        :raises requests.Timeout: if the server does not answer in time
        """
        version_file_url = f"{self.data_url}{self.version_file}"
        r = requests.get(version_file_url, timeout=self.version_request_timeout)
        # raise_for_status() is a no-op on success, so no r.ok guard is needed
        r.raise_for_status()
        # The YAML comes from a remote server: use the safe loader, which only
        # builds plain Python types (all a version file needs).
        version_yaml = yaml.safe_load(r.text)
        return str(version_yaml['build'])

    def get_data(self) -> bool:
        """
        Downloads every file listed in self.data_files into self.data_path.

        :return: True once all downloads have been requested
        """
        # self.data_path is presumably set up by SourceDataLoader - it is not assigned in this class
        data_puller = GetData()
        for data_file in self.data_files:
            source_data_url = f'{self.data_url}{data_file}'
            data_puller.pull_via_http(source_data_url, self.data_path)
        return True

    def parse_data(self) -> dict:
        """
        Parses the data file for graph nodes/edges

        :return: ret_val: load_metadata
        """
        # self.output_file_writer is presumably provided by SourceDataLoader - verify against the base class
        extractor = Extractor(file_writer=self.output_file_writer)

        ohd_archive_file_path: str = os.path.join(self.data_path, self.ohd_archive_file)
        with ZipFile(ohd_archive_file_path) as ohd_archive:
            with ohd_archive.open(self.ohd_edges_file, "r") as fp:
                # Zip members are opened as bytes; decode explicitly instead of relying
                # on the platform default. NOTE(review): assumes the CSV is UTF-8 - confirm.
                with TextIOWrapper(fp, encoding='utf-8') as edges_file:
                    extractor.csv_extract(edges_file,
                                          lambda line: line[EDGESDATACOLS.SUBJECT_ID.value],  # subject id
                                          lambda line: line[EDGESDATACOLS.OBJECT_ID.value],  # object id
                                          lambda line: line[EDGESDATACOLS.PREDICATE.value],  # predicate extractor
                                          lambda line: {NAME: line[EDGESDATACOLS.SUBJECT_NAME.value]},  # subject props
                                          lambda line: {NAME: line[EDGESDATACOLS.OBJECT_NAME.value]},  # object props
                                          self.get_edge_properties,  # edgeprops
                                          comment_character=None,
                                          delim=',',
                                          has_header_row=True)

        return extractor.load_metadata

    @staticmethod
    def get_edge_properties(line) -> dict:
        """
        Builds the edge property dictionary for one row of the edges CSV.

        :param line: one CSV row as a sequence of strings, indexed by EDGESDATACOLS
        :return: edge properties keyed by biolink constant names
        :raises ValueError: if a numeric column cannot be converted
        """
        # TODO: this should probably be something like this instead to match COHD,
        #  because merged edges wont be able to handle conflicting attributes
        #  across multiple supporting studies:
        #
        #  'attributes': [orjson.dumps({
        #      HAS_SUPPORTING_STUDY_RESULT: [{
        #          P_VALUE: float(line[EDGESDATACOLS.CHI_SQUARED_P_VALUE.value]),
        #          LOG_ODDS_RATIO: float(line[EDGESDATACOLS.LOG_ODDS_RATIO.value]),
        #          LOG_ODDS_RATIO_95_CI: orjson.loads(line[EDGESDATACOLS.LOG_ODDS_RATIO_95_CI.value]),
        #          TOTAL_SAMPLE_SIZE: int(line[EDGESDATACOLS.TOTAL_SAMPLE_SIZE.value])
        #      }]
        #  }).decode('utf-8')]
        return {
            AGENT_TYPE: DATA_PIPELINE,
            KNOWLEDGE_LEVEL: STATISTICAL_ASSOCIATION,
            # NOTE(review): score is passed through as the raw CSV string while the other
            # numeric columns are converted - confirm whether it should be a float.
            'score': line[EDGESDATACOLS.SCORE.value],
            PRIMARY_KNOWLEDGE_SOURCE: line[EDGESDATACOLS.PRIMARY_KS.value],
            P_VALUE: float(line[EDGESDATACOLS.CHI_SQUARED_P_VALUE.value]),
            LOG_ODDS_RATIO: float(line[EDGESDATACOLS.LOG_ODDS_RATIO.value]),
            # the confidence interval column holds JSON text (presumably a [low, high] pair)
            LOG_ODDS_RATIO_95_CI: orjson.loads(line[EDGESDATACOLS.LOG_ODDS_RATIO_95_CI.value]),
            TOTAL_SAMPLE_SIZE: int(line[EDGESDATACOLS.TOTAL_SAMPLE_SIZE.value])
        }