-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexercise.py
139 lines (122 loc) · 5.38 KB
/
exercise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python
import rdflib
from rdflib.plugins.sparql.processor import SPARQLResult
from rdfpandas.graph import to_dataframe
import pandas as pd
from pandas import DataFrame
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import urllib
import json
import os
# Change directories to /input/
os.chdir('input')
# Make the Turtle file into a DataFrame
g = rdflib.Graph()
g.parse('programming_exercise.skos.ttl', format = 'ttl')
df = to_dataframe(g)
# Query via SPARQL on our Turtle file
results = g.query("""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT ?concept ?label
WHERE {
?concept skos:prefLabel ?label .
?concept skos:broader* <http://purl.obolibrary.org/obo/DOID_4159> .
}
""")
# Define our function to store the SPARQLResult object in a DataFrame
def sparql_results_to_df(results: SPARQLResult) -> DataFrame:
"""
Export results from an rdflib SPARQL query into a `pandas.DataFrame`,
using Python types. See https://github.com/RDFLib/rdflib/issues/1179.
"""
return DataFrame(
data=([None if x is None else x.toPython() for x in row] for row in results),
columns=[str(x) for x in results.vars],
)
# Use our function to produce the DataFrame
df1 = sparql_results_to_df(results)
# Do some regex to get rid of hyperlinks and replace underscores with colons
df1['concept'] = df1['concept'].str.replace(r'_', ':', regex=True)
df1['concept'] = df1['concept'].str.replace(r'^http://purl.obolibrary.org/obo/', '', regex=True)
# Set up some variables to prep for API calls
labels = df1['label']
ids = df1['concept']
# API call to MONDO using labels to search
iris = []
for label in labels:
encodedLabel = urllib.parse.quote(label)
url = "http://www.ebi.ac.uk/ols/api/select?q="+encodedLabel+"&queryFields=label&ontology=mondo&fieldList=id,iri,label,score,synonym"
response = requests.get(url).json()
if response['response']['numFound'] == 1:
iri_hit = [i['iri'] for i in response["response"]["docs"]]
iri_hit = str(iri_hit)[2:-2]
iris.append(iri_hit.replace("http://purl.obolibrary.org/obo/", ""))
elif response['response']['numFound'] > 1:
iri_hits = [i['iri'] for i in response["response"]["docs"]]
iri_hit = iri_hits[0]
iris.append(iri_hit.replace("http://purl.obolibrary.org/obo/", ""))
else:
iri_hit = "No result"
iris.append(iri_hit)
# Add results to new column in DataFrame
df1['MONDO_IRI'] = iris
# API call to EFO ontology using labels to search
iris = []
for label in labels:
encodedLabel = urllib.parse.quote(label)
url = "http://www.ebi.ac.uk/ols/api/select?q="+encodedLabel+"&queryFields=label&ontology=efo&fieldList=id,iri,label,score,synonym"
response = requests.get(url).json()
if response['response']['numFound'] == 1:
iri_hit = [i['iri'] for i in response["response"]["docs"]]
iri_hit = str(iri_hit)[2:-2]
iris.append(iri_hit.replace("http://www.ebi.ac.uk/efo/", ""))
elif response['response']['numFound'] > 1:
iri_hits = [i['iri'] for i in response["response"]["docs"]]
iri_hit = iri_hits[0]
iris.append(iri_hit.replace("http://www.ebi.ac.uk/efo/", ""))
else:
iri_hit = "No result"
iris.append(iri_hit)
# Add results to new column in DataFrame
df1['EFO_IRI'] = iris
# Replace any duplicate hits EFO returned with MONO URIs
df1['EFO_IRI'] = df1.EFO_IRI.str.replace(r'^http://purl.obolibrary.org/obo/.+', r'No result', regex=True)
# Join MONDO and EFO mappings separated by a semi-colon, also drop duplicate No results
df1["OLS_Mappings"] = df1[['MONDO_IRI', 'EFO_IRI']].agg('; '.join, axis=1)
df1['OLS_Mappings'] = df1['OLS_Mappings'].str.replace('No result; ', '')
df1['OLS_Mappings'] = df1['OLS_Mappings'].str.replace('; No result', '')
# API call to OXO mappings service using IDs
meshes = []
efos = []
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
for i in ids:
url = "https://www.ebi.ac.uk/spot/oxo/api/mappings?fromId="+i
response = session.get(url).json()
try:
mesh_hits = [i['toTerm']['curie'] for i in response['_embedded']['mappings'] if i['toTerm']['datasource']['prefix'] == 'MeSH']
efo_hits = [i['fromTerm']['curie'] for i in response['_embedded']['mappings'] if i['fromTerm']['datasource']['prefix'] == 'EFO']
efo_hits = str(efo_hits)[2:-2]
meshes.append(mesh_hits[0])
efos.append(efo_hits)
except:
meshes.append('No result')
efos.append('No result')
# Add results to new column in DataFrame
df1['MESH_ID'] = meshes
df1['EFO_ID'] = efos
# Join those columns into new column in DataFrame, also drop duplicate No results
df1["OXO_Mappings"] = df1[['MESH_ID', 'EFO_ID']].agg('; '.join, axis=1)
df1['OXO_Mappings'] = df1['OXO_Mappings'].str.replace('No result; ', '')
df1['OXO_Mappings'] = df1['OXO_Mappings'].str.replace('; No result', '')
# Make final DataFrame by subsetting df1
df_final = df1[['concept', 'label', 'OLS_Mappings', 'OXO_Mappings']]
# Write out the CSV to /output/ directory
# WARNING: When opening file in spreadsheet software, be sure to de-select semicolons as delimiters in the preview window!
df_final.to_csv('../output/output.csv')