Skip to content

Commit

Permalink
Create JGI_genbank.py
Browse files Browse the repository at this point in the history
  • Loading branch information
tslaird authored Dec 20, 2018
1 parent 2f437d2 commit 3f71a52
Showing 1 changed file with 70 additions and 0 deletions.
70 changes: 70 additions & 0 deletions JGI_genbank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# For batch downloading the assembled GenBank (.gbk) files of IMG genomes
# from the JGI Genome Portal.

# file is the path to a .txt file containing IMG genome IDs, one per line
# username is your IMG username/email, passed as a string
# password is your IMG account password, passed as a string

def get_gbk(file, username, password):
    """Download the assembled GenBank file for each IMG genome id in *file*.

    For every id the JGI sign-on endpoint is called, the Genome Portal
    directory listing for ``IMG_<id>`` is fetched, and the first
    ``*assembled.gbk`` entry is downloaded to ``<label>_<filename>`` (a
    relative path, so it lands relative to the current working directory).
    Ids that cannot be downloaded are collected and printed at the end.

    Args:
        file: Path to a text file with one IMG genome id per line. Blank
            lines and surrounding whitespace are ignored.
        username: IMG/JGI login (username or e-mail).
        password: IMG/JGI account password.

    Returns:
        None. Progress and the list of failed ids are printed to stdout.

    Raises:
        OSError: If *file* or an output file cannot be opened.
        pycurl.error: Transfer errors raised by ``perform()`` are not caught
            here and abort the batch.
    """
    import time

    with open(file) as f:
        # Strip and drop empty lines so a trailing newline or stray spaces
        # never turn into a request for the bare organism 'IMG_'.
        IMG_ids = [line.strip() for line in f if line.strip()]

    not_in_Genome_Portal = []
    for img_id in IMG_ids:
        id_to_search = 'IMG_' + str(img_id)
        print(id_to_search)
        if not _download_gbk(id_to_search, username, password):
            not_in_Genome_Portal.append(id_to_search)
        # Pause between genomes (presumably to be polite to the JGI servers).
        time.sleep(3)
    print('could not download '+str(len(not_in_Genome_Portal))+ ' Genome files:')
    print(not_in_Genome_Portal)


def _download_gbk(id_to_search, username, password):
    """Sign in, look up *id_to_search* and download its assembled .gbk.

    Returns True when a file was downloaded, False when the portal does not
    exist or its listing contains no assembled GenBank entry.
    """
    # Imported here (not at module level) so that merely defining or calling
    # get_gbk with no ids does not require pycurl.
    import pycurl
    import urllib.parse
    from io import BytesIO

    c = pycurl.Curl()
    try:
        c.setopt(c.VERBOSE, 1)

        # A fresh sign-on is done for every genome, as in the original flow.
        params = {'login': username, 'password': password}
        c.setopt(c.URL, 'https://signon-old.jgi.doe.gov/signon/create')
        c.setopt(c.POSTFIELDS, urllib.parse.urlencode(params).encode())
        c.setopt(c.COOKIEFILE, 'cookies')
        c.perform()

        # Fetch the portal's directory listing into memory.
        available_files = BytesIO()
        c.setopt(c.URL, 'https://genome.jgi.doe.gov/portal/ext-api/downloads/get-directory?organism='+id_to_search)
        c.setopt(c.COOKIEJAR, 'cookies')
        c.setopt(c.HTTPGET, 1)  # switch the handle back from POST to GET
        c.setopt(c.WRITEDATA, available_files)
        c.perform()

        x = available_files.getvalue().decode('iso-8859-1')
        print(x)
        if x == 'Portal does not exist':
            print(id_to_search+':'+ x)
            return False

        target = _parse_gbk_entry(x)
        if target is None:
            # Previously this case raised IndexError and killed the batch.
            print(id_to_search + ': no assembled .gbk file found in portal listing')
            return False

        # NOTE(review): name_of_file is built from server-supplied text and is
        # used as a path as-is; confirm the portal never returns separators.
        name_of_file, url = target
        print(name_of_file)

        file_to_download = 'https://genome.jgi.doe.gov/portal/ext-api/downloads/get_tape_file?blocking=true&url='+url
        c.setopt(c.URL, file_to_download)
        c.setopt(c.COOKIEJAR, 'cookies')
        c.setopt(c.HTTPGET, 1)
        # The context manager guarantees the output is flushed and closed
        # even if the transfer raises.
        with open(name_of_file, "wb") as file_handle:
            c.setopt(c.WRITEDATA, file_handle)
            c.perform()
        return True
    finally:
        # Always release the handle, including on errors.
        c.close()


def _parse_gbk_entry(listing):
    """Extract ``(output_filename, url)`` for the assembled .gbk in *listing*.

    Returns None when the listing has no file label, no ``*assembled.gbk``
    filename, or no matching url.
    """
    import re

    labels = re.findall(r'(?<=file label=")(.*?)(?=")', listing)
    names = re.findall(r'(?<=filename=")(.*assembled\.gbk)(?=" size)', listing)
    urls = re.findall(r'(?<=\;url=)(.*assembled\.gbk)(?=")', listing)
    if not (labels and names and urls):
        return None

    # Spaces and dashes become underscores; parentheses are dropped.
    label = labels[0].translate(
        str.maketrans({' ': '_', '-': '_', '(': None, ')': None})
    )
    return label + '_' + str(names[0]), str(urls[0])

0 comments on commit 3f71a52

Please sign in to comment.