Skip to content

Commit

Permalink
Expanded the set of accession formats that will download a WGS set
Browse files Browse the repository at this point in the history
Previously, only the versioned LLLLVV prefix would work (e.g. AAAK03).
The following accession forms are now also supported:
LLLL - unversioned prefix, will get latest WGS set version. E.g. AAAK
LLLLVV000000 - versioned master accession. E.g. AAAK03000000
LLLL00000000 - unversioned master accession. E.g. AAAK00000000
  • Loading branch information
nicsilvester committed Jun 14, 2017
1 parent 6af3b0b commit 9d37269
Show file tree
Hide file tree
Showing 14 changed files with 170 additions and 57 deletions.
4 changes: 2 additions & 2 deletions python/analysisGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def set_parser():
help='Destination directory (default is current running directory)')
parser.add_argument('-m', '--meta', action='store_true',
help='Download analysis XML in addition to data files (default is false)')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser


Expand All @@ -38,7 +38,7 @@ def set_parser():

try:
readGet.download_files(accession, utils.SUBMITTED_FORMAT, dest_dir, False, fetch_meta)
print 'Download completed'
print 'Completed'
except Exception:
utils.print_error()
sys.exit(1)
4 changes: 2 additions & 2 deletions python/assemblyGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def set_parser():
help='Destination directory (default is current running directory)')
parser.add_argument('-w', '--wgs', action='store_true',
help='Download WGS set if available (default is false)')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser

def check_format(format):
Expand Down Expand Up @@ -133,7 +133,7 @@ def download_assembly(dest_dir, accession, format, fetch_wgs, quiet=False):

try:
download_assembly(dest_dir, accession, format, fetch_wgs)
print 'Download completed'
print 'Completed'
except Exception:
utils.print_error()
sys.exit(1)
4 changes: 2 additions & 2 deletions python/enaDataGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def set_parser():
parser.add_argument('-i', '--index', action='store_true',
help="""Download CRAM index files with submitted CRAM files, if any (default is false).
This flag is ignored for fastq and sra format options. """)
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser


Expand Down Expand Up @@ -73,7 +73,7 @@ def set_parser():
else:
print 'Error: Invalid accession provided'
sys.exit(1)
print 'Download completed'
print 'Completed'
except Exception:
utils.print_error()
sys.exit(1)
4 changes: 2 additions & 2 deletions python/enaGroupGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def set_parser():
parser.add_argument('-i', '--index', action='store_true',
help="""Download CRAM index files with submitted CRAM files, if any (default is false).
This flag is ignored for fastq and sra format options. """)
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser

def download_report(group, result, accession, temp_file):
Expand Down Expand Up @@ -146,7 +146,7 @@ def download_group(accession, group, format, dest_dir, fetch_wgs, fetch_meta, fe

try:
download_group(accession, group, format, dest_dir, fetch_wgs, fetch_meta, fetch_index)
print 'Download completed'
print 'Completed'
except Exception:
utils.print_error()
sys.exit(1)
4 changes: 2 additions & 2 deletions python/readGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def set_parser():
parser.add_argument('-i', '--index', action='store_true',
help="""Download CRAM index files with submitted CRAM files, if any (default is false).
This flag is ignored for fastq and sra format options""")
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser

def check_read_format(format):
Expand Down Expand Up @@ -132,7 +132,7 @@ def download_files(accession, format, dest_dir, fetch_index, fetch_meta):

try:
download_files(accession, format, dest_dir, fetch_index, fetch_meta)
print 'Download completed'
print 'Completed'
except Exception:
utils.print_error()
sys.exit(1)
53 changes: 40 additions & 13 deletions python/sequenceGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ def set_parser():
parser = argparse.ArgumentParser(prog='sequenceGet',
description='Download sequence data for a given INSDC accession')
parser.add_argument('accession', help='INSDC sequence/coding accession or WGS prefix (LLLLVV) to fetch')
parser.add_argument('-f', '--format', default='embl', choices=['embl', 'fasta'],
help='File format required (default is embl)')
parser.add_argument('-f', '--format', default='embl', choices=['embl', 'fasta', 'master'],
help='File format required (default is embl); master format only available for WGS')
parser.add_argument('-d', '--dest', default='.',
help='Destination directory (default is current running directory)')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser

def append_record(dest_file, accession, format):
Expand All @@ -28,44 +28,71 @@ def download_sequence(dest_dir, accession, format):
success = utils.download_record(dest_dir, accession, format)
if not success:
print 'Unable to fetch file for ' + accession + ', format ' + format
return success

def download_wgs(dest_dir, accession, format):
    """Download a WGS set, choosing the helper that matches the accession.

    Accessions that utils.is_unversioned_wgs_set recognises are handled by
    download_unversioned_wgs; every other accession is handled by
    download_versioned_wgs. The chosen helper's return value is passed
    straight back to the caller.
    """
    fetch = download_versioned_wgs
    if utils.is_unversioned_wgs_set(accession):
        fetch = download_unversioned_wgs
    return fetch(dest_dir, accession, format)

def download_versioned_wgs(dest_dir, accession, format):
    """Download the WGS set file for a versioned accession.

    Only the first six characters of the accession (the LLLLVV prefix) are
    used to build the FTP URLs, so a bare prefix such as AAAK03 and a
    versioned master accession such as AAAK03000000 resolve to the same
    file. The public location is tried first and the suppressed location
    is used as a fallback.

    A format of None falls back to utils.EMBL_FORMAT.

    Returns the result of the last utils.get_ftp_file call, which is falsy
    when neither location yielded a file, so that download_wgs can hand a
    status back to its caller.
    """
    prefix = accession[:6]
    if format is None:
        format = utils.EMBL_FORMAT

    public_set_url = utils.get_wgs_ftp_url(prefix, utils.PUBLIC, format)
    success = utils.get_ftp_file(public_set_url, dest_dir)
    if not success:
        # Suppressed sets live in a parallel tree; only consult it when the
        # public download did not succeed.
        supp_set_url = utils.get_wgs_ftp_url(prefix, utils.SUPPRESSED, format)
        success = utils.get_ftp_file(supp_set_url, dest_dir)
    if not success:
        # NOTE(review): the contact address below appears garbled in this
        # view of the file - confirm against the repository before release.
        print('No WGS set file available for ' + accession + ', format ' + format)
        print('Please contact ENA ([email protected]) if you feel this set should be available')
    return success

def download_unversioned_wgs(dest_dir, accession, format):
    """Download the latest WGS set file for an unversioned accession.

    Only the first four characters of the accession (the LLLL prefix) are
    used, so a bare prefix such as AAAK and an unversioned master accession
    such as AAAK00000000 are treated alike. The public location is tried
    first, then the suppressed one; for each, the URL is resolved with
    utils.get_nonversioned_wgs_ftp_url, which returns None when no matching
    file is listed.

    A format of None falls back to utils.EMBL_FORMAT.

    Returns True as soon as one download succeeds, otherwise prints the
    "not available" message and returns False.
    """
    prefix = accession[:4]
    if format is None:
        format = utils.EMBL_FORMAT

    # Both locations are resolved through the same listing helper; only the
    # status component of the FTP path differs.
    for status in (utils.PUBLIC, utils.SUPPRESSED):
        set_url = utils.get_nonversioned_wgs_ftp_url(prefix, status, format)
        if set_url is not None and utils.get_ftp_file(set_url, dest_dir):
            return True

    # NOTE(review): the contact address below appears garbled in this view
    # of the file - confirm against the repository before release.
    print('No WGS set file available for ' + accession + ', format ' + format)
    print('Please contact ENA ([email protected]) if you feel this set should be available')
    return False

def check_format(format):
if format not in [utils.EMBL_FORMAT, utils.FASTA_FORMAT]:
print 'Please select a valid format for this accession: ', [utils.EMBL_FORMAT, utils.FASTA_FORMAT]
allowed_formats = [utils.EMBL_FORMAT, utils.FASTA_FORMAT, utils.MASTER_FORMAT]
if format not in allowed_formats:
print 'Please select a valid format for this accession: ', allowed_formats
sys.exit(1)

if __name__ == '__main__':
parser = set_parser()
args = parser.parse_args()

accession = args.accession
accession = args.accession.upper()
format = args.format
dest_dir = args.dest

try:
if utils.is_sequence(accession) or utils.is_coding(accession):
if utils.is_wgs_set(accession):
download_wgs(dest_dir, accession, format)
elif utils.is_sequence(accession) or utils.is_coding(accession):
if not utils.is_available(accession):
print 'Record does not exist or is not available for accession provided'
sys.exit(1)
if format == utils.MASTER_FORMAT:
print 'Invalid format. master format only available for WGS sets'
sys.exit(1)
download_sequence(dest_dir, accession, format)
elif utils.is_wgs_set(accession):
download_wgs(dest_dir, accession, format)
else:
print 'Error: Invalid accession. A sequence or coding accession or a WGS set prefix (LLLLVV) must be provided'
print 'Error: Invalid accession. A sequence or coding accession or a WGS set (prefix or master accession) must be provided'
sys.exit(1)
print 'Download completed'
print 'Completed'
except Exception:
utils.print_error()
sys.exit(1)
42 changes: 36 additions & 6 deletions python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#

import base64
import ftplib
import re
import os
import subprocess
Expand Down Expand Up @@ -55,6 +56,7 @@
SEQUENCE_RELEASE_ID='sequence_release'

WGS_FTP_BASE = 'ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs'
WGS_FTP_DIR = 'pub/databases/ena/wgs'

PORTAL_SEARCH_BASE = 'http://www.ebi.ac.uk/ena/portal/api/search?'
RUN_RESULT = 'result=read_run'
Expand All @@ -78,6 +80,9 @@
sequence_pattern_3 = re.compile('^[A-Z]{4}[0-9]{8,9}(\.[0-9]+)?$')
coding_pattern = re.compile('^[A-Z]{3}[0-9]{5}(\.[0-9]+)?$')
wgs_prefix_pattern = re.compile('^[A-Z]{4}[0-9]{2}$')
wgs_master_pattern = re.compile('^[A-Z]{4}[0-9]{2}[0]{6}$')
unversion_wgs_prefix_pattern = re.compile('^[A-Z]{4}$')
unversion_wgs_master_pattern = re.compile('^[A-Z]{4}[0]{8}$')
run_pattern = re.compile('^[EDS]RR[0-9]{6,7}$')
experiment_pattern = re.compile('^[EDS]RX[0-9]{6,7}$')
analysis_pattern = re.compile('^[EDS]RZ[0-9]{6,7}$')
Expand All @@ -102,7 +107,14 @@ def is_coding(accession):
return coding_pattern.match(accession)

def is_wgs_set(accession):
return wgs_prefix_pattern.match(accession)
return wgs_prefix_pattern.match(accession) \
or wgs_master_pattern.match(accession) \
or unversion_wgs_prefix_pattern.match(accession) \
or unversion_wgs_master_pattern.match(accession)

def is_unversioned_wgs_set(accession):
    """Tell whether the accession names a WGS set without a version.

    Returns the match object from unversion_wgs_prefix_pattern if that
    pattern matches, otherwise whatever unversion_wgs_master_pattern.match
    returns (a match object, or None when neither pattern matches).
    """
    prefix_match = unversion_wgs_prefix_pattern.match(accession)
    if prefix_match:
        return prefix_match
    return unversion_wgs_master_pattern.match(accession)

def is_run(accession):
return run_pattern.match(accession)
Expand Down Expand Up @@ -244,14 +256,32 @@ def get_ftp_file_with_md5_check(ftp_url, dest_dir, md5):
except Exception:
return False

def get_wgs_ftp_url(wgs_set, status, format):
base_url = WGS_FTP_BASE + '/' + status + '/' + wgs_set[:2].lower() + '/' + wgs_set
def get_wgs_file_ext(format):
if format == EMBL_FORMAT:
return base_url + WGS_EMBL_EXT
return WGS_EMBL_EXT
elif format == FASTA_FORMAT:
return base_url + WGS_FASTA_EXT
return WGS_FASTA_EXT
elif format == MASTER_FORMAT:
return base_url + WGS_MASTER_EXT
return WGS_MASTER_EXT

def get_wgs_ftp_url(wgs_set, status, format):
    """Build the FTP URL of a WGS set file.

    The URL is WGS_FTP_BASE, the status directory, a directory named after
    the lower-cased first two characters of the set, and the set name
    itself, followed by the file extension that get_wgs_file_ext returns
    for the requested format.
    """
    subdir = wgs_set[:2].lower()
    path_parts = [WGS_FTP_BASE, status, subdir, wgs_set]
    return '/'.join(path_parts) + get_wgs_file_ext(format)

def get_nonversioned_wgs_ftp_url(wgs_set, status, format):
    """Find the FTP URL of the highest-named WGS set file for a prefix.

    Lists the FTP directory for the given status and the lower-cased first
    two characters of wgs_set, keeps the entries that start with wgs_set
    and end with the extension for the requested format, and returns the
    URL of the entry that sorts last.

    Returns None when no entry matches, including when the server refuses
    the directory change or the listing with a permanent error (ftplib
    raises error_perm for a missing directory and, on some servers, for an
    empty one).
    """
    ftp_host = 'ftp.ebi.ac.uk'
    subdir = status + '/' + wgs_set[:2].lower()
    file_ext = get_wgs_file_ext(format)

    ftp = ftplib.FTP(ftp_host)
    try:
        ftp.login()
        try:
            ftp.cwd(WGS_FTP_DIR + '/' + subdir)
            listing = ftp.nlst()
        except ftplib.error_perm:
            # Treat "no such directory" / "no files" as an empty listing so
            # callers get the usual None instead of an exception.
            listing = []
    finally:
        # Always release the connection, even if login or listing failed.
        ftp.close()

    candidates = [name for name in listing
                  if name.startswith(wgs_set) and name.endswith(file_ext)]
    if not candidates:
        return None
    # Lexicographic max picks the latest version - assumes versions are
    # zero-padded to a fixed width in the file names; TODO confirm.
    return WGS_FTP_BASE + '/' + subdir + '/' + max(candidates)

def get_report_from_portal(url):
request = urllib2.Request(url)
Expand Down
4 changes: 2 additions & 2 deletions python3/analysisGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def set_parser():
help='Destination directory (default is current running directory)')
parser.add_argument('-m', '--meta', action='store_true',
help='Download analysis XML in addition to data files (default is false)')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser


Expand All @@ -38,7 +38,7 @@ def set_parser():

try:
readGet.download_files(accession, utils.SUBMITTED_FORMAT, dest_dir, False, fetch_meta)
print ('Download completed')
print ('Completed')
except Exception:
utils.print_error()
sys.exit(1)
4 changes: 2 additions & 2 deletions python3/assemblyGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def set_parser():
help='Destination directory (default is current running directory)')
parser.add_argument('-w', '--wgs', action='store_true',
help='Download WGS set if available (default is false)')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser

def check_format(format):
Expand Down Expand Up @@ -133,7 +133,7 @@ def download_assembly(dest_dir, accession, format, fetch_wgs, quiet=False):

try:
download_assembly(dest_dir, accession, format, fetch_wgs)
print ('Download completed')
print ('Completed')
except Exception:
utils.print_error()
sys.exit(1)
4 changes: 2 additions & 2 deletions python3/enaDataGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def set_parser():
parser.add_argument('-i', '--index', action='store_true',
help="""Download CRAM index files with submitted CRAM files, if any (default is false).
This flag is ignored for fastq and sra format options. """)
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser


Expand Down Expand Up @@ -73,7 +73,7 @@ def set_parser():
else:
print ('Error: Invalid accession provided')
sys.exit(1)
print ('Download completed')
print ('Completed')
except Exception:
utils.print_error()
sys.exit(1)
4 changes: 2 additions & 2 deletions python3/enaGroupGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def set_parser():
parser.add_argument('-i', '--index', action='store_true',
help="""Download CRAM index files with submitted CRAM files, if any (default is false).
This flag is ignored for fastq and sra format options. """)
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser

def download_report(group, result, accession, temp_file):
Expand Down Expand Up @@ -146,7 +146,7 @@ def download_group(accession, group, format, dest_dir, fetch_wgs, fetch_meta, fe

try:
download_group(accession, group, format, dest_dir, fetch_wgs, fetch_meta, fetch_index)
print ('Download completed')
print ('Completed')
except Exception:
utils.print_error()
sys.exit(1)
4 changes: 2 additions & 2 deletions python3/readGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def set_parser():
parser.add_argument('-i', '--index', action='store_true',
help="""Download CRAM index files with submitted CRAM files, if any (default is false).
This flag is ignored for fastq and sra format options""")
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.1')
return parser

def check_read_format(format):
Expand Down Expand Up @@ -132,7 +132,7 @@ def download_files(accession, format, dest_dir, fetch_index, fetch_meta):

try:
download_files(accession, format, dest_dir, fetch_index, fetch_meta)
print ('Download completed')
print ('Completed')
except Exception:
utils.print_error()
sys.exit(1)
Loading

0 comments on commit 9d37269

Please sign in to comment.