Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates for a couple issues in geneontology/noctua-models#55 #3

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
language: scala
sudo: required

before_install:
- sudo apt-get update -qq

install:
# python3-venv is required by environment.sh to create the virtualenv.
- sudo apt-get install -y python3-venv -qq

script:
# Sourced (not executed) so the virtualenv activation persists for the
# following script steps, which Travis runs in the same shell.
- source environment.sh
# NOTE(review): -o is the same path as -f, so the resolver overwrites its
# input in place before sbt consumes it — confirm this is intentional.
- python3 scripts/syngo_uniprot_resolver.py -f resources/SynGO_export_test.json -o resources/SynGO_export_test.json
- sbt "run -ni resources/SynGO_export_test.json"
5 changes: 5 additions & 0 deletions environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Create a Python 3 virtualenv in ./env, activate it, and install the
# script dependencies. Must be sourced (". environment.sh" / "source
# environment.sh") for the activation to persist in the caller's shell.
python3 -m venv env
. env/bin/activate

pip3 install -r requirements.txt
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Python dependencies for the scripts/ helpers (pandas for TSV parsing,
# requests for UniProt HTTP calls).
# NOTE(review): scripts/uniprot_wrapper.py also imports UniProtMapper,
# which is not listed here — confirm and add if needed.
pandas
requests
Binary file not shown.
94 changes: 94 additions & 0 deletions scripts/syngo_uniprot_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import json
import requests
import argparse
from pandas import read_csv
from io import StringIO
from os import path
from uniprot_wrapper import UniprotWrapper, UniprotWrapper2

# Command-line interface: -f/--filename (required) is the SynGO annotation
# export JSON to process; -o/--outfile optionally overrides the default
# output path ("<filename>_updated<ext>", see main()).
p = argparse.ArgumentParser()
p.add_argument('-f', "--filename", type=str, required=True, help="input filename of SynGO annotation export JSON")
p.add_argument('-o', "--outfile", type=str, required=False, help="output filename")

def main():
    """Resolve the UniProt accessions in a SynGO annotation export to
    model-organism database ids and enrich each model entry in place,
    then write the updated JSON.

    Reads args.filename (SynGO export JSON); writes to args.outfile, or
    to "<filename>_updated<ext>" when no outfile was given.
    """
    args = p.parse_args()

    filename = args.filename
    with open(filename) as f:
        data = json.load(f)

    wrapper = UniprotWrapper2()
    extensions = []  # every extension relation key seen across the export
    ext_genes = []   # extension values that look like bare gene ids (no ":")
    ext_sets = []    # combi_ids of annotations using more than one extension relation
    id_map = {}

    # First pass: collect the set of unique UniProt accessions to resolve.
    for a in data["SynGO"]:
        for m in a['models']:
            uniprot_id = m["uniprot"]
            if uniprot_id not in id_map:
                id_map[uniprot_id] = {}

    # Single batched lookup for all accessions: {accession: {field: value}}.
    id_map = wrapper.lookup_uniprot(list(id_map.keys()))

    # Example target id forms:
    # <http://rgd.mcw.edu/rgdweb/report/gene/main.html?id=620107>
    # <http://www.informatics.jax.org/accession/MGI:MGI:95615>

    # Second pass: annotate each model entry with the resolved ids.
    for a in data["SynGO"]:
        models = []
        for m in a['models']:
            uniprot_id = m['uniprot']
            if id_map[uniprot_id] == {}:
                uniprot_id = uniprot_id.split("-")[0] # Adjust for isoforms
                # NOTE(review): assumes the base accession was added to
                # id_map by lookup_uniprot's isoform retry — confirm,
                # otherwise the lookups below raise KeyError.
            noctua_gene_id = wrapper.get_noctua_gene_id(id_map, uniprot_id)
            if noctua_gene_id is None:
                # Fall back to the original (possibly isoform) accession.
                noctua_gene_id = m["uniprot"]
            m["noctua_gene_id"] = noctua_gene_id
            # Which of the target DBs actually produced the id, if any.
            resulting_dbs = list(set(wrapper.DBS_TO_LOOKUP) & set(id_map[uniprot_id].keys()))
            if len(resulting_dbs) > 0:
                m["id_db_source"] = resulting_dbs[0]
            else:
                m["id_db_source"] = "uniprot"
                m["taxon_id"] = UniprotWrapper.get_field_for_id(id_map, "taxon_id", uniprot_id) # Gotta do these when can't infer from id_db_source
                for field in wrapper.OTHER_FIELDS_TO_FETCH:
                    if field in id_map[uniprot_id]:
                        m[field] = UniprotWrapper.get_field_for_id(id_map, field, uniprot_id)
            if "gene_symbol" in id_map[uniprot_id]:
                m["gene_symbol"] = UniprotWrapper.get_field_for_id(id_map, "gene_symbol", uniprot_id)
            if "gene_symbol" not in m:
                # Report missing info for this one
                # NOTE(review): "GENENAME" is the parent UniprotWrapper field
                # name; UniprotWrapper2 fetches "gene_names" instead, so this
                # lookup likely always returns None — confirm.
                print(m['uniprot'] + " - " + a['combi_id'] + " - " + str(UniprotWrapper.get_field_for_id(id_map, "GENENAME", uniprot_id)))
            ext_relations = []
            for e in m['extensions']:
                for k in e.keys():
                    go_terms = []
                    uberon_terms = []
                    other_terms = []
                    if k not in extensions:
                        extensions.append(k)
                    if k not in ext_relations:
                        ext_relations.append(k)
                    # Bucket the extension values by id namespace.
                    for t in e[k]:
                        if ":" not in t and t not in ext_genes:
                            ext_genes.append(t)
                        if t.startswith("GO:"):
                            go_terms.append(t)
                        elif t.startswith("UBERON:"):
                            uberon_terms.append(t)
                        else:
                            other_terms.append(t)
                    # if len(go_terms) > 1 or len(uberon_terms) > 1 or len(other_terms) > 0:
                    #     ext_sets.append([a["combi_id"],e[k]])
            if len(set(ext_relations)) > 1:
                ext_sets.append(a["combi_id"])
            models.append(m)
        a['models'] = models

    outfile = args.outfile
    if outfile is None:
        # Default: write alongside the input as "<name>_updated<ext>".
        outfile = path.splitext(filename)[0] + "_updated" + path.splitext(filename)[1]
    with open(outfile, "w+") as wf:
        json.dump(data, wf, indent=4)


if __name__ == "__main__":
    main()
182 changes: 182 additions & 0 deletions scripts/uniprot_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import json
import requests
import argparse
import logging
from pandas import read_csv
from io import StringIO
from os import path
from UniProtMapper import ProtMapper


# Command-line interface for running this module directly: -f/--filename
# (required) points at a space-separated UniProt ID list; -o/--outfile and
# -d/--debug_level are optional.
p = argparse.ArgumentParser()
p.add_argument('-f', "--filename", type=str, required=True, help="input filename of space-separated UniProt ID list")
p.add_argument('-o', "--outfile", type=str, required=False, help="output filename for result JSON")
p.add_argument('-d', "--debug_level", type=str, required=False, help="logging level")

class UniprotWrapper():
    """Wrapper around the (legacy) uniprot.org batch ID-mapping service.

    Maps UniProt accessions to model-organism database ids and a few extra
    fields, accumulating results into a dict of the form
    {accession: {field: value}}.

    NOTE(review): the https://www.uniprot.org/uploadlists/ endpoint used by
    make_uniprot_call has been retired upstream; UniprotWrapper2 is the
    replacement built on UniProtMapper.
    """

    # Target databases to map each accession against.
    # DBS_TO_LOOKUP = ["MGI_ID", "RGD_ID", "FLYBASE_ID", "WORMBASE_ID", "HGNC_ID"]
    DBS_TO_LOOKUP = ["MGI_ID", "RGD_ID", "FLYBASE_ID", "WORMBASE_ID"]
    # Additional non-database fields fetched per accession.
    OTHER_FIELDS_TO_FETCH = ["GENENAME"]

    def make_uniprot_call(self, uniprot_ids, current_map=None):
        """Query the UniProt mapping service for every field in
        DBS_TO_LOOKUP + OTHER_FIELDS_TO_FETCH, 500 accessions per request.

        Results are merged into current_map and the updated map returned.
        """
        if current_map is None:
            # BUGFIX: the original dereferenced None when no map was passed.
            current_map = {}
        request_min = 0
        while request_min < len(uniprot_ids):
            print(request_min)
            for field in self.DBS_TO_LOOKUP + self.OTHER_FIELDS_TO_FETCH:
                r = requests.get('https://www.uniprot.org/uploadlists/?from=ACC&to=' + field + '&format=tab&query=' + " ".join(uniprot_ids[request_min:request_min+500]))
                uniprot_results = read_csv(StringIO(r.text), delimiter='\t')

                for index, row in uniprot_results.iterrows():
                    # Positional Series indexing (row[0]) is deprecated in
                    # pandas; use .iloc for integer access.
                    from_id = row.iloc[0]
                    logging.debug(from_id + " - " + field)
                    # BUGFIX: setdefault so ids not pre-seeded in the map
                    # (e.g. isoform base ids from the redo pass in
                    # lookup_uniprot) no longer raise KeyError.
                    current_map.setdefault(from_id, {})[field] = row.iloc[1]

            request_min += 500 # For some reason requesting >1000 results in 400 error
        return current_map

    def lookup_uniprot(self, uniprot_ids, current_map=None, isoform_check=True):
        """Resolve uniprot_ids into {accession: {field: value}}.

        When isoform_check is True, accessions that returned no data and
        look like isoforms ("P12345-2") are retried with their base
        accession ("P12345").
        """
        if current_map is None:
            current_map = {}
            # Seed every requested accession so unresolved ids still appear
            # in the result with an empty dict.
            for uid in uniprot_ids:
                current_map[uid] = {}
        current_map = self.make_uniprot_call(uniprot_ids, current_map)

        # Adjust for isoforms: retry unresolved "<acc>-<n>" ids as "<acc>".
        if isoform_check:
            redo_ids = []
            for k in current_map:
                if current_map[k] == {} and "-" in k:
                    redo_id = k.split("-")[0]
                    redo_ids.append(redo_id)
            if redo_ids:
                current_map = self.make_uniprot_call(redo_ids, current_map)

        return current_map

    @staticmethod
    def one_off_call(uniprot_id):
        """Fetch a single accession's flat-file record and return a short
        display label for it (see get_gene_label)."""
        r = requests.get('http://www.uniprot.org/uniprot/' + uniprot_id + '.txt')
        return UniprotWrapper.get_gene_label(r.text.split("\n"))

    @staticmethod
    def get_organism_id(uniprot_id):
        """Return the NCBI taxon id for uniprot_id via the current UniProt
        REST API (TSV response)."""
        r = requests.get("https://rest.uniprot.org/uniprotkb/{}.tsv?fields=accession,organism_id".format(uniprot_id))
        return UniprotWrapper.parse_taxon_from_tsv(r.text)

    @staticmethod
    def get_gene_label(result_lines):
        """Build a short display label ("<gene> <Spec>") from UniProt
        flat-file lines.

        NOTE(review): the "GN Name=" prefix and the [5:] slicing below look
        written against the flat file's multi-space column layout
        ("GN   Name="); the whitespace here may have been collapsed in
        transit — confirm against a raw UniProt .txt record.
        """
        gene_name = ""
        species = ""
        for line in result_lines:
            if line.startswith("GN Name="):
                gene_name = line[5:len(line)].split(";")[0].split("{")[0]
                gene_name = gene_name[5:len(gene_name)].rstrip()
            elif line.startswith("OS"):
                species = line[5:len(line)]
                species = species.split(" ")
                # Abbreviate: first letter of genus + first 3 of species.
                species = species[0][0] + species[1][0:3]
        label = gene_name + " " + species
        return label

    @staticmethod
    def get_taxon_id(uniprot_id):
        """Return the NCBI taxon id for uniprot_id via the legacy flat-file
        endpoint (see parse_taxon_from_txt)."""
        r = requests.get('http://www.uniprot.org/uniprot/' + uniprot_id + '.txt')
        return UniprotWrapper.parse_taxon_from_txt(r.text)

    @staticmethod
    def parse_taxon_from_txt(result_text):
        """Extract the NCBI taxon id from a UniProt flat-file record;
        returns "" when no OX line is found.

        NOTE(review): as with get_gene_label, the single-space prefix may
        be collapsed whitespace from the original "OX   NCBI_TaxID=".
        """
        taxon_prefix = "OX NCBI_TaxID="
        taxon_id = ""
        for line in result_text.split("\n"):
            if line.startswith(taxon_prefix):
                taxon_id = line[len(taxon_prefix):len(line)].rstrip(";").split("{")[0].rstrip()
        return taxon_id

    @staticmethod
    def parse_taxon_from_tsv(result_text):
        """Extract the taxon id from a two-column UniProt TSV response.

        Skips the "Entry" header and blank lines; returns the second column
        of the last data row, or "" when none is found.
        """
        taxon_id = ""
        for line in result_text.split("\n"):
            if not line or line.startswith("Entry"):
                continue
            parts = line.split("\t")
            # BUGFIX: guard against malformed rows without a tab column
            # (the original raised IndexError here).
            if len(parts) > 1:
                taxon_id = parts[1].rstrip()
        return taxon_id

    @staticmethod
    def get_field_for_id(current_map, field, uniprot_id):
        """Return current_map[uniprot_id][field] as a string, or None when
        the field was not fetched for this accession."""
        if field in current_map[uniprot_id]:
            return str(current_map[uniprot_id][field])
        return None

    def get_noctua_gene_id(self, current_map, uniprot_id):
        """Return a model-organism DB id for uniprot_id, or None when no
        target database matched.

        Note: when the accession maps to several of DBS_TO_LOOKUP, the LAST
        database in the list wins (the loop does not break).
        """
        noctua_gene_id = None
        for db in self.DBS_TO_LOOKUP:
            if db in current_map[uniprot_id]:
                noctua_gene_id = UniprotWrapper.get_field_for_id(current_map, db, uniprot_id)
        return noctua_gene_id


class UniprotWrapper2(UniprotWrapper):
    """UniProt ID-mapping wrapper built on UniProtMapper.ProtMapper,
    replacing the retired uniprot.org/uploadlists endpoint used by the
    parent class. Inherits lookup_uniprot/get_noctua_gene_id and overrides
    only the batch call.
    """

    # Target databases, tried in this order; ids that fail to map against
    # one database are carried forward and tried against the next.
    DBS_TO_LOOKUP = ["MGI", "RGD", "FlyBase", "WormBase", "ZFIN"]
    OTHER_FIELDS_TO_FETCH = ["gene_names"]
    # Shared mapper instance (class attribute: constructed once at import).
    mapper = ProtMapper()

    def make_uniprot_call(self, uniprot_ids, current_map=None):
        """Map uniprot_ids against each database in DBS_TO_LOOKUP, then
        fetch gene symbol and taxon id for every accession seen so far.

        Results are merged into current_map ({accession: {field: value}})
        and the updated map returned.
        """
        if current_map is None:
            # BUGFIX: the original dereferenced None when no map was passed.
            current_map = {}
        for field in self.DBS_TO_LOOKUP:
            result, failed = self.mapper.get(
                ids=list(uniprot_ids), from_db="UniProtKB_AC-ID", to_db=field
            )
            # result is a pandas DataFrame with "From"/"To" columns.
            for index, row in result.iterrows():
                from_id = row["From"]
                to_id = row["To"]
                # BUGFIX: setdefault so ids not pre-seeded in the map (e.g.
                # isoform base ids from the redo pass in lookup_uniprot)
                # no longer raise KeyError.
                current_map.setdefault(from_id, {})[field] = to_id

            # Only ids that failed against this DB are tried on the next.
            uniprot_ids = failed

        # Second pass: fetch UniProtKB fields for every accession seen so
        # far. NOTE(review): relies on ProtMapper.get's defaults including
        # the "Gene Names" and "Organism (ID)" columns — confirm.
        result, failed = self.mapper.get(
            ids=list(current_map.keys())
        )
        for index, row in result.iterrows():
            uniprot_id = row["From"]
            gene_names = row["Gene Names"]
            taxon_id = row["Organism (ID)"]
            # NOTE(review): assumes gene_names is a non-empty str; a missing
            # value would come back as float NaN and break split — confirm.
            first_gene_name = gene_names.split(" ")[0]
            current_map[uniprot_id]["gene_symbol"] = first_gene_name
            current_map[uniprot_id]["taxon_id"] = taxon_id

        return current_map


def main():
    """CLI entry point: read a space-separated UniProt ID list from
    --filename, resolve the ids via UniprotWrapper, and write the result
    map as JSON to --outfile (or print it when no outfile is given).
    """
    args = p.parse_args()

    if args.debug_level is not None:
        logging.basicConfig(level=args.debug_level)

    filename = args.filename
    outfile = args.outfile
    with open(filename) as f:
        ids = f.read().split(" ")

    wrapper = UniprotWrapper()
    id_map = wrapper.lookup_uniprot(ids)

    if outfile is not None:
        # BUGFIX: the original parsed --outfile but then ignored it,
        # always writing to "<filename>_output<ext>" instead.
        with open(outfile, "w") as wf:
            json.dump(id_map, wf, indent=4)
    else:
        print(id_map)


if __name__ == "__main__":
    main()
Loading