-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJGI_genbank.py
70 lines (63 loc) · 2.6 KB
/
JGI_genbank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Batch-downloads the assembled GenBank (.gbk) annotation file for each IMG
# genome listed in a text file, using the JGI Genome Portal download API.
# file is the path to a .txt file containing IMG genome IDs, one per line
# username is your IMG/JGI username (email address), passed as a string
# password is your IMG/JGI account password, passed as a string
def get_gbk(file, username, password):
    """Download the assembled GenBank file for each IMG genome ID in *file*.

    For every ID the function signs in to JGI, requests the Genome Portal
    file listing for organism ``IMG_<id>`` and saves the first
    ``*assembled.gbk`` entry into the current working directory as
    ``<cleaned file label>_<filename>``. IDs whose portal does not exist, or
    whose listing contains no assembled GenBank entry, are collected and
    printed at the end.

    Args:
        file: Path to a text file containing one IMG genome ID per line.
            Blank lines are ignored.
        username: JGI/IMG login name, sent as the ``login`` form field.
        password: JGI/IMG account password.

    Returns:
        None. Progress and the list of IDs that could not be downloaded are
        printed to standard output.

    Raises:
        OSError: If *file* cannot be read or an output file cannot be written.
        pycurl.error: If a transfer fails at the libcurl level.
    """
    # Imports are kept function-local, as in the original script.
    import pycurl
    from io import BytesIO
    import urllib.parse
    import re
    import time

    with open(file) as f:
        # Skip blank lines so that e.g. a trailing newline does not trigger a
        # bogus lookup for the organism "IMG_".
        IMG_ids = [line.strip() for line in f if line.strip()]

    not_in_Genome_Portal = []
    for img_id in IMG_ids:
        available_files = BytesIO()
        c = pycurl.Curl()
        try:
            c.setopt(c.VERBOSE, 1)

            # Sign in; COOKIEFILE switches on libcurl's cookie engine so the
            # session cookie is reused by the following requests.
            url = 'https://signon-old.jgi.doe.gov/signon/create'
            params = {'login': username, 'password': password}
            c.setopt(c.URL, url)
            c.setopt(c.POSTFIELDS, urllib.parse.urlencode(params).encode())
            c.setopt(c.COOKIEFILE, 'cookies')
            c.perform()

            # Fetch the XML listing of downloadable files for this genome.
            id_to_search = 'IMG_' + str(img_id)
            print(id_to_search)
            c.setopt(c.URL, 'https://genome.jgi.doe.gov/portal/ext-api/downloads/get-directory?organism=' + id_to_search)
            c.setopt(c.COOKIEJAR, 'cookies')
            c.setopt(c.HTTPGET, 1)  # undo the POST configured for the login
            c.setopt(c.WRITEDATA, available_files)
            c.perform()

            x = available_files.getvalue().decode('iso-8859-1')
            print(x)
            if x == 'Portal does not exist':
                print(id_to_search + ':' + x)
                not_in_Genome_Portal.append(id_to_search)
            else:
                label_matches = re.findall(r'(?<=file label=")(.*?)(?=")', x)
                name = re.findall(r'(?<=filename=")(.*assembled\.gbk)(?=" size)', x)
                urls = re.findall(r'(?<=\;url=)(.*assembled\.gbk)(?=")', x)
                if not (label_matches and name and urls):
                    # The listing has no assembled GenBank entry; record the
                    # ID instead of aborting the whole batch on IndexError.
                    print(id_to_search + ': no assembled.gbk file found in listing')
                    not_in_Genome_Portal.append(id_to_search)
                else:
                    # Make the label safe to use as a file-name prefix.
                    labels = label_matches[0]
                    labels = labels.replace(' ', '_')
                    labels = re.sub(r'\(|\)', '', labels)
                    labels = labels.replace('-', '_')

                    name_of_file = labels + '_' + str(name[0])
                    print(name_of_file)
                    file_to_download = 'https://genome.jgi.doe.gov/portal/ext-api/downloads/get_tape_file?blocking=true&url=' + str(urls[0])
                    # COOKIEJAR and HTTPGET set above are sticky on the
                    # handle, so only the URL and the sink change here.
                    c.setopt(c.URL, file_to_download)
                    # The context manager guarantees the output file is
                    # flushed and closed even if the transfer raises.
                    with open(name_of_file, "wb") as out_file:
                        c.setopt(c.WRITEDATA, out_file)
                        c.perform()
        finally:
            # Always release the curl handle, including on the
            # "Portal does not exist" path and on errors.
            c.close()
        # Pause between genomes before issuing the next set of requests.
        time.sleep(3)

    print('could not download ' + str(len(not_in_Genome_Portal)) + ' Genome files:')
    print(not_in_Genome_Portal)