Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates for a couple issues in geneontology/noctua-models#55 #3

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
language: scala
sudo: required

before_install:
- sudo apt-get update -qq

install:
# python3-venv is required by environment.sh to create the virtualenv.
- sudo apt-get install -y python3-venv -qq

script:
# Sourced (not executed) so the virtualenv activation persists for the
# following script steps, which Travis runs in the same shell.
- source environment.sh
# NOTE(review): -o is the same path as -f, so the resolver overwrites its
# input in place before sbt consumes it — confirm this is intentional.
- python3 scripts/syngo_uniprot_resolver.py -f resources/SynGO_export_test.json -o resources/SynGO_export_test.json
- sbt "run -ni resources/SynGO_export_test.json"
5 changes: 5 additions & 0 deletions environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Create a Python 3 virtualenv in ./env, activate it, and install the
# script dependencies. Must be sourced (". environment.sh" / "source
# environment.sh") for the activation to persist in the caller's shell.
python3 -m venv env
. env/bin/activate

pip3 install -r requirements.txt
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Python dependencies for the scripts/ helpers (pandas for TSV parsing,
# requests for UniProt HTTP calls).
# NOTE(review): scripts/uniprot_wrapper.py also imports UniProtMapper,
# which is not listed here — confirm and add if needed.
pandas
requests
Binary file not shown.
94 changes: 94 additions & 0 deletions scripts/syngo_uniprot_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import json
import requests
import argparse
from pandas import read_csv
from io import StringIO
from os import path
from uniprot_wrapper import UniprotWrapper, UniprotWrapper2

# Command-line interface: -f/--filename (required) is the SynGO annotation
# export JSON to process; -o/--outfile optionally overrides the default
# output path ("<filename>_updated<ext>", see main()).
p = argparse.ArgumentParser()
p.add_argument('-f', "--filename", type=str, required=True, help="input filename of SynGO annotation export JSON")
p.add_argument('-o', "--outfile", type=str, required=False, help="output filename")

def main():
    """Resolve the UniProt accessions in a SynGO annotation export to
    model-organism database ids and enrich each model entry in place,
    then write the updated JSON.

    Reads args.filename (SynGO export JSON); writes to args.outfile, or
    to "<filename>_updated<ext>" when no outfile was given.
    """
    args = p.parse_args()

    filename = args.filename
    with open(filename) as f:
        data = json.load(f)

    wrapper = UniprotWrapper2()
    extensions = []  # every extension relation key seen across the export
    ext_genes = []   # extension values that look like bare gene ids (no ":")
    ext_sets = []    # combi_ids of annotations using more than one extension relation
    id_map = {}

    # First pass: collect the set of unique UniProt accessions to resolve.
    for a in data["SynGO"]:
        for m in a['models']:
            uniprot_id = m["uniprot"]
            if uniprot_id not in id_map:
                id_map[uniprot_id] = {}

    # Single batched lookup for all accessions: {accession: {field: value}}.
    id_map = wrapper.lookup_uniprot(list(id_map.keys()))

    # Example target id forms:
    # <http://rgd.mcw.edu/rgdweb/report/gene/main.html?id=620107>
    # <http://www.informatics.jax.org/accession/MGI:MGI:95615>

    # Second pass: annotate each model entry with the resolved ids.
    for a in data["SynGO"]:
        models = []
        for m in a['models']:
            uniprot_id = m['uniprot']
            if id_map[uniprot_id] == {}:
                uniprot_id = uniprot_id.split("-")[0] # Adjust for isoforms
                # NOTE(review): assumes the base accession was added to
                # id_map by lookup_uniprot's isoform retry — confirm,
                # otherwise the lookups below raise KeyError.
            noctua_gene_id = wrapper.get_noctua_gene_id(id_map, uniprot_id)
            if noctua_gene_id is None:
                # Fall back to the original (possibly isoform) accession.
                noctua_gene_id = m["uniprot"]
            m["noctua_gene_id"] = noctua_gene_id
            # Which of the target DBs actually produced the id, if any.
            resulting_dbs = list(set(wrapper.DBS_TO_LOOKUP) & set(id_map[uniprot_id].keys()))
            if len(resulting_dbs) > 0:
                m["id_db_source"] = resulting_dbs[0]
            else:
                m["id_db_source"] = "uniprot"
                m["taxon_id"] = UniprotWrapper.get_field_for_id(id_map, "taxon_id", uniprot_id) # Gotta do these when can't infer from id_db_source
                for field in wrapper.OTHER_FIELDS_TO_FETCH:
                    if field in id_map[uniprot_id]:
                        m[field] = UniprotWrapper.get_field_for_id(id_map, field, uniprot_id)
            if "gene_symbol" in id_map[uniprot_id]:
                m["gene_symbol"] = UniprotWrapper.get_field_for_id(id_map, "gene_symbol", uniprot_id)
            if "gene_symbol" not in m:
                # Report missing info for this one
                # NOTE(review): "GENENAME" is the parent UniprotWrapper field
                # name; UniprotWrapper2 fetches "gene_names" instead, so this
                # lookup likely always returns None — confirm.
                print(m['uniprot'] + " - " + a['combi_id'] + " - " + str(UniprotWrapper.get_field_for_id(id_map, "GENENAME", uniprot_id)))
            ext_relations = []
            for e in m['extensions']:
                for k in e.keys():
                    go_terms = []
                    uberon_terms = []
                    other_terms = []
                    if k not in extensions:
                        extensions.append(k)
                    if k not in ext_relations:
                        ext_relations.append(k)
                    # Bucket the extension values by id namespace.
                    for t in e[k]:
                        if ":" not in t and t not in ext_genes:
                            ext_genes.append(t)
                        if t.startswith("GO:"):
                            go_terms.append(t)
                        elif t.startswith("UBERON:"):
                            uberon_terms.append(t)
                        else:
                            other_terms.append(t)
                    # if len(go_terms) > 1 or len(uberon_terms) > 1 or len(other_terms) > 0:
                    #     ext_sets.append([a["combi_id"],e[k]])
            if len(set(ext_relations)) > 1:
                ext_sets.append(a["combi_id"])
            models.append(m)
        a['models'] = models

    outfile = args.outfile
    if outfile is None:
        # Default: write alongside the input as "<name>_updated<ext>".
        outfile = path.splitext(filename)[0] + "_updated" + path.splitext(filename)[1]
    with open(outfile, "w+") as wf:
        json.dump(data, wf, indent=4)


if __name__ == "__main__":
    main()
182 changes: 182 additions & 0 deletions scripts/uniprot_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import json
import requests
import argparse
import logging
from pandas import read_csv
from io import StringIO
from os import path
from UniProtMapper import ProtMapper


# Command-line interface for running this module directly: -f/--filename
# (required) points at a space-separated UniProt ID list; -o/--outfile and
# -d/--debug_level are optional.
p = argparse.ArgumentParser()
p.add_argument('-f', "--filename", type=str, required=True, help="input filename of space-separated UniProt ID list")
p.add_argument('-o', "--outfile", type=str, required=False, help="output filename for result JSON")
p.add_argument('-d', "--debug_level", type=str, required=False, help="logging level")

class UniprotWrapper():
    """Wrapper around the (legacy) uniprot.org batch ID-mapping service.

    Maps UniProt accessions to model-organism database ids and a few extra
    fields, accumulating results into a dict of the form
    {accession: {field: value}}.

    NOTE(review): the https://www.uniprot.org/uploadlists/ endpoint used by
    make_uniprot_call has been retired upstream; UniprotWrapper2 is the
    replacement built on UniProtMapper.
    """

    # Target databases to map each accession against.
    # DBS_TO_LOOKUP = ["MGI_ID", "RGD_ID", "FLYBASE_ID", "WORMBASE_ID", "HGNC_ID"]
    DBS_TO_LOOKUP = ["MGI_ID", "RGD_ID", "FLYBASE_ID", "WORMBASE_ID"]
    # Additional non-database fields fetched per accession.
    OTHER_FIELDS_TO_FETCH = ["GENENAME"]

    def make_uniprot_call(self, uniprot_ids, current_map=None):
        """Query the UniProt mapping service for every field in
        DBS_TO_LOOKUP + OTHER_FIELDS_TO_FETCH, 500 accessions per request.

        Results are merged into current_map and the updated map returned.
        """
        if current_map is None:
            # BUGFIX: the original dereferenced None when no map was passed.
            current_map = {}
        request_min = 0
        while request_min < len(uniprot_ids):
            print(request_min)
            for field in self.DBS_TO_LOOKUP + self.OTHER_FIELDS_TO_FETCH:
                r = requests.get('https://www.uniprot.org/uploadlists/?from=ACC&to=' + field + '&format=tab&query=' + " ".join(uniprot_ids[request_min:request_min+500]))
                uniprot_results = read_csv(StringIO(r.text), delimiter='\t')

                for index, row in uniprot_results.iterrows():
                    # Positional Series indexing (row[0]) is deprecated in
                    # pandas; use .iloc for integer access.
                    from_id = row.iloc[0]
                    logging.debug(from_id + " - " + field)
                    # BUGFIX: setdefault so ids not pre-seeded in the map
                    # (e.g. isoform base ids from the redo pass in
                    # lookup_uniprot) no longer raise KeyError.
                    current_map.setdefault(from_id, {})[field] = row.iloc[1]

            request_min += 500 # For some reason requesting >1000 results in 400 error
        return current_map

    def lookup_uniprot(self, uniprot_ids, current_map=None, isoform_check=True):
        """Resolve uniprot_ids into {accession: {field: value}}.

        When isoform_check is True, accessions that returned no data and
        look like isoforms ("P12345-2") are retried with their base
        accession ("P12345").
        """
        if current_map is None:
            current_map = {}
            # Seed every requested accession so unresolved ids still appear
            # in the result with an empty dict.
            for uid in uniprot_ids:
                current_map[uid] = {}
        current_map = self.make_uniprot_call(uniprot_ids, current_map)

        # Adjust for isoforms: retry unresolved "<acc>-<n>" ids as "<acc>".
        if isoform_check:
            redo_ids = []
            for k in current_map:
                if current_map[k] == {} and "-" in k:
                    redo_id = k.split("-")[0]
                    redo_ids.append(redo_id)
            if redo_ids:
                current_map = self.make_uniprot_call(redo_ids, current_map)

        return current_map

    @staticmethod
    def one_off_call(uniprot_id):
        """Fetch a single accession's flat-file record and return a short
        display label for it (see get_gene_label)."""
        r = requests.get('http://www.uniprot.org/uniprot/' + uniprot_id + '.txt')
        return UniprotWrapper.get_gene_label(r.text.split("\n"))

    @staticmethod
    def get_organism_id(uniprot_id):
        """Return the NCBI taxon id for uniprot_id via the current UniProt
        REST API (TSV response)."""
        r = requests.get("https://rest.uniprot.org/uniprotkb/{}.tsv?fields=accession,organism_id".format(uniprot_id))
        return UniprotWrapper.parse_taxon_from_tsv(r.text)

    @staticmethod
    def get_gene_label(result_lines):
        """Build a short display label ("<gene> <Spec>") from UniProt
        flat-file lines.

        NOTE(review): the "GN Name=" prefix and the [5:] slicing below look
        written against the flat file's multi-space column layout
        ("GN   Name="); the whitespace here may have been collapsed in
        transit — confirm against a raw UniProt .txt record.
        """
        gene_name = ""
        species = ""
        for line in result_lines:
            if line.startswith("GN Name="):
                gene_name = line[5:len(line)].split(";")[0].split("{")[0]
                gene_name = gene_name[5:len(gene_name)].rstrip()
            elif line.startswith("OS"):
                species = line[5:len(line)]
                species = species.split(" ")
                # Abbreviate: first letter of genus + first 3 of species.
                species = species[0][0] + species[1][0:3]
        label = gene_name + " " + species
        return label

    @staticmethod
    def get_taxon_id(uniprot_id):
        """Return the NCBI taxon id for uniprot_id via the legacy flat-file
        endpoint (see parse_taxon_from_txt)."""
        r = requests.get('http://www.uniprot.org/uniprot/' + uniprot_id + '.txt')
        return UniprotWrapper.parse_taxon_from_txt(r.text)

    @staticmethod
    def parse_taxon_from_txt(result_text):
        """Extract the NCBI taxon id from a UniProt flat-file record;
        returns "" when no OX line is found.

        NOTE(review): as with get_gene_label, the single-space prefix may
        be collapsed whitespace from the original "OX   NCBI_TaxID=".
        """
        taxon_prefix = "OX NCBI_TaxID="
        taxon_id = ""
        for line in result_text.split("\n"):
            if line.startswith(taxon_prefix):
                taxon_id = line[len(taxon_prefix):len(line)].rstrip(";").split("{")[0].rstrip()
        return taxon_id

    @staticmethod
    def parse_taxon_from_tsv(result_text):
        """Extract the taxon id from a two-column UniProt TSV response.

        Skips the "Entry" header and blank lines; returns the second column
        of the last data row, or "" when none is found.
        """
        taxon_id = ""
        for line in result_text.split("\n"):
            if not line or line.startswith("Entry"):
                continue
            parts = line.split("\t")
            # BUGFIX: guard against malformed rows without a tab column
            # (the original raised IndexError here).
            if len(parts) > 1:
                taxon_id = parts[1].rstrip()
        return taxon_id

    @staticmethod
    def get_field_for_id(current_map, field, uniprot_id):
        """Return current_map[uniprot_id][field] as a string, or None when
        the field was not fetched for this accession."""
        if field in current_map[uniprot_id]:
            return str(current_map[uniprot_id][field])
        return None

    def get_noctua_gene_id(self, current_map, uniprot_id):
        """Return a model-organism DB id for uniprot_id, or None when no
        target database matched.

        Note: when the accession maps to several of DBS_TO_LOOKUP, the LAST
        database in the list wins (the loop does not break).
        """
        noctua_gene_id = None
        for db in self.DBS_TO_LOOKUP:
            if db in current_map[uniprot_id]:
                noctua_gene_id = UniprotWrapper.get_field_for_id(current_map, db, uniprot_id)
        return noctua_gene_id


class UniprotWrapper2(UniprotWrapper):
    """UniProt ID-mapping wrapper built on UniProtMapper.ProtMapper,
    replacing the retired uniprot.org/uploadlists endpoint used by the
    parent class. Inherits lookup_uniprot/get_noctua_gene_id and overrides
    only the batch call.
    """

    # Target databases, tried in this order; ids that fail to map against
    # one database are carried forward and tried against the next.
    DBS_TO_LOOKUP = ["MGI", "RGD", "FlyBase", "WormBase", "ZFIN"]
    OTHER_FIELDS_TO_FETCH = ["gene_names"]
    # Shared mapper instance (class attribute: constructed once at import).
    mapper = ProtMapper()

    def make_uniprot_call(self, uniprot_ids, current_map=None):
        """Map uniprot_ids against each database in DBS_TO_LOOKUP, then
        fetch gene symbol and taxon id for every accession seen so far.

        Results are merged into current_map ({accession: {field: value}})
        and the updated map returned.
        """
        if current_map is None:
            # BUGFIX: the original dereferenced None when no map was passed.
            current_map = {}
        for field in self.DBS_TO_LOOKUP:
            result, failed = self.mapper.get(
                ids=list(uniprot_ids), from_db="UniProtKB_AC-ID", to_db=field
            )
            # result is a pandas DataFrame with "From"/"To" columns.
            for index, row in result.iterrows():
                from_id = row["From"]
                to_id = row["To"]
                # BUGFIX: setdefault so ids not pre-seeded in the map (e.g.
                # isoform base ids from the redo pass in lookup_uniprot)
                # no longer raise KeyError.
                current_map.setdefault(from_id, {})[field] = to_id

            # Only ids that failed against this DB are tried on the next.
            uniprot_ids = failed

        # Second pass: fetch UniProtKB fields for every accession seen so
        # far. NOTE(review): relies on ProtMapper.get's defaults including
        # the "Gene Names" and "Organism (ID)" columns — confirm.
        result, failed = self.mapper.get(
            ids=list(current_map.keys())
        )
        for index, row in result.iterrows():
            uniprot_id = row["From"]
            gene_names = row["Gene Names"]
            taxon_id = row["Organism (ID)"]
            # NOTE(review): assumes gene_names is a non-empty str; a missing
            # value would come back as float NaN and break split — confirm.
            first_gene_name = gene_names.split(" ")[0]
            current_map[uniprot_id]["gene_symbol"] = first_gene_name
            current_map[uniprot_id]["taxon_id"] = taxon_id

        return current_map


def main():
    """CLI entry point: read a space-separated UniProt ID list from
    --filename, resolve the ids via UniprotWrapper, and write the result
    map as JSON to --outfile (or print it when no outfile is given).
    """
    args = p.parse_args()

    if args.debug_level is not None:
        logging.basicConfig(level=args.debug_level)

    filename = args.filename
    outfile = args.outfile
    with open(filename) as f:
        ids = f.read().split(" ")

    wrapper = UniprotWrapper()
    id_map = wrapper.lookup_uniprot(ids)

    if outfile is not None:
        # BUGFIX: the original parsed --outfile but then ignored it,
        # always writing to "<filename>_output<ext>" instead.
        with open(outfile, "w") as wf:
            json.dump(id_map, wf, indent=4)
    else:
        print(id_map)


if __name__ == "__main__":
    main()
Loading