Skip to content

Commit

Permalink
Added expanded flag (to expand CON embl format records)
Browse files (browse the repository at this point in the history)
  • Loading branch information
nicsilvester committed May 10, 2018
1 parent 437a268 commit 9341ba1
Show file tree
Hide file tree
Showing 12 changed files with 87 additions and 56 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
### Mac ###
.DS_Store
**/.DS_Store

### compiled python ###
python/*.pyc
python3/__pycache__
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ optional arguments:
(default is false)
-e, --extract-wgs Extract WGS scaffolds for each assembly if available
(default is false)
-exp, --expanded Expand CON scaffolds when downloading embl format
(default is false)
-m, --meta Download read or analysis XML in addition to data
files (default is false)
-i, --index Download CRAM index files with submitted CRAM files,
Expand Down Expand Up @@ -215,6 +217,8 @@ optional arguments:
(default is false)
-e, --extract-wgs Extract WGS scaffolds for each assembly if available
(default is false)
-exp, --expanded Expand CON scaffolds when downloading embl format
(default is false)
-m, --meta Download read or analysis XML in addition to data
files (default is false)
-i, --index Download CRAM index files with submitted CRAM files,
Expand Down
20 changes: 10 additions & 10 deletions python/assemblyGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def extract_wgs_sequences(accession_list):
other_sequences = [a for a in accession_list if not utils.is_wgs_sequence(a)]
return wgs_sequences, other_sequences

def download_sequence_set(accession_list, mol_type, assembly_dir, output_format, quiet):
def download_sequence_set(accession_list, mol_type, assembly_dir, output_format, expanded, quiet):
failed_accessions = []
count = 0
sequence_cnt = len(accession_list)
Expand All @@ -86,7 +86,7 @@ def download_sequence_set(accession_list, mol_type, assembly_dir, output_format,
target_file_path = os.path.join(assembly_dir, utils.get_filename(mol_type, output_format))
target_file = open(target_file_path, 'w')
for accession in accession_list:
success = sequenceGet.write_record(target_file, accession, output_format)
success = sequenceGet.write_record(target_file, accession, output_format, expanded)
if not success:
failed_accessions.append(accession)
else:
Expand All @@ -102,15 +102,15 @@ def download_sequence_set(accession_list, mol_type, assembly_dir, output_format,
print 'Failed to fetch following {0}, format {1}'.format(mol_type, output_format)
print ','.join(failed_accessions)

def download_sequences(sequence_report, assembly_dir, output_format, quiet):
def download_sequences(sequence_report, assembly_dir, output_format, expanded, quiet):
local_sequence_report = os.path.join(assembly_dir, sequence_report)
replicon_list, unlocalised_list, unplaced_list, patch_list = parse_sequence_report(local_sequence_report)
wgs_scaffolds, other_unlocalised = _sequences(unlocalised_list)
wgs_scaffolds, other_unlocalised = extract_wgs_sequences(unlocalised_list)
wgs_unplaced, other_unplaced = extract_wgs_sequences(unplaced_list)
download_sequence_set(replicon_list, REPLICON, assembly_dir, output_format, quiet)
download_sequence_set(other_unlocalised, UNLOCALISED, assembly_dir, output_format, quiet)
download_sequence_set(other_unplaced, UNPLACED, assembly_dir, output_format, quiet)
download_sequence_set(patch_list, PATCH, assembly_dir, output_format, quiet)
download_sequence_set(replicon_list, REPLICON, assembly_dir, output_format, expanded, quiet)
download_sequence_set(other_unlocalised, UNLOCALISED, assembly_dir, output_format, expanded, quiet)
download_sequence_set(other_unplaced, UNPLACED, assembly_dir, output_format, expanded, quiet)
download_sequence_set(patch_list, PATCH, assembly_dir, output_format, expanded, quiet)
wgs_scaffolds.extend(wgs_unplaced)
return wgs_scaffolds

Expand All @@ -135,7 +135,7 @@ def extract_wgs_scaffolds(assembly_dir, wgs_scaffolds, wgs_set, output_format, q
target_file.flush()
target_file.close()

def download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, quiet=False):
def download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, expanded, quiet=False):
if output_format is None:
output_format = utils.EMBL_FORMAT
assembly_dir = os.path.join(dest_dir, accession)
Expand All @@ -153,7 +153,7 @@ def download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs
wgs_scaffolds = []
wgs_scaffold_cnt = 0
if has_sequence_report:
wgs_scaffolds = download_sequences(sequence_report.split('/')[-1], assembly_dir, output_format, quiet)
wgs_scaffolds = download_sequences(sequence_report.split('/')[-1], assembly_dir, output_format, expanded, quiet)
wgs_scaffold_cnt = len(wgs_scaffolds)
if wgs_scaffold_cnt > 0:
if not quiet:
Expand Down
7 changes: 5 additions & 2 deletions python/enaDataGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def set_parser():
help='Download WGS set for each assembly if available (default is false)')
parser.add_argument('-e', '--extract-wgs', action='store_true',
help='Extract WGS scaffolds for each assembly if available (default is false)')
parser.add_argument('-exp', '--expanded', action='store_true',
help='Expand CON scaffolds when downloading embl format (default is false)')
parser.add_argument('-m', '--meta', action='store_true',
help='Download read or analysis XML in addition to data files (default is false)')
parser.add_argument('-i', '--index', action='store_true',
Expand All @@ -65,6 +67,7 @@ def set_parser():
dest_dir = args.dest
fetch_wgs = args.wgs
extract_wgs = args.extract_wgs
expanded = args.expanded
fetch_meta = args.meta
fetch_index = args.index
aspera = args.aspera
Expand All @@ -84,7 +87,7 @@ def set_parser():
elif utils.is_sequence(accession):
if output_format is not None:
sequenceGet.check_format(output_format)
sequenceGet.download_sequence(dest_dir, accession, output_format)
sequenceGet.download_sequence(dest_dir, accession, output_format, expanded)
elif utils.is_analysis(accession):
if output_format is not None:
readGet.check_read_format(output_format)
Expand All @@ -96,7 +99,7 @@ def set_parser():
elif utils.is_assembly(accession):
if output_format is not None:
assemblyGet.check_format(output_format)
assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs)
assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, expanded)
else:
sys.stderr.write('ERROR: Invalid accession provided\n')
sys.exit(1)
Expand Down
29 changes: 16 additions & 13 deletions python/enaGroupGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def set_parser():
help='Download WGS set for each assembly if available (default is false)')
parser.add_argument('-e', '--extract-wgs', action='store_true',
help='Extract WGS scaffolds for each assembly if available (default is false)')
parser.add_argument('-exp', '--expanded', action='store_true',
help='Expand CON scaffolds when downloading embl format (default is false)')
parser.add_argument('-m', '--meta', action='store_true',
help='Download read or analysis XML in addition to data files (default is false)')
parser.add_argument('-i', '--index', action='store_true',
Expand All @@ -68,18 +70,18 @@ def download_report(group, result, accession, temp_file, subtree):
f.flush()
f.close()

def download_data(group, data_accession, output_format, group_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera):
def download_data(group, data_accession, output_format, group_dir, fetch_wgs, extract_wgs, expanded, fetch_meta, fetch_index, aspera):
if group == utils.WGS:
print 'Fetching ' + data_accession[:6]
sequenceGet.download_wgs(group_dir, data_accession[:6], output_format)
else:
print 'Fetching ' + data_accession
if group == utils.ASSEMBLY:
assemblyGet.download_assembly(group_dir, data_accession, output_format, fetch_wgs, extract_wgs, True)
assemblyGet.download_assembly(group_dir, data_accession, output_format, fetch_wgs, extract_wgs, expanded, True)
elif group in [utils.READ, utils.ANALYSIS]:
readGet.download_files(data_accession, output_format, group_dir, fetch_index, fetch_meta, aspera)

def download_data_group(group, accession, output_format, group_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree):
def download_data_group(group, accession, output_format, group_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree, expanded):
temp_file_path = os.path.join(group_dir, accession + '_temp.txt')
download_report(group, utils.get_group_result(group), accession, temp_file_path, subtree)
header = True
Expand All @@ -89,10 +91,10 @@ def download_data_group(group, accession, output_format, group_dir, fetch_wgs, e
header = False
continue
data_accession = line.strip()
download_data(group, data_accession, output_format, group_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera)
download_data(group, data_accession, output_format, group_dir, fetch_wgs, extract_wgs, expanded, fetch_meta, fetch_index, aspera)
os.remove(temp_file_path)

def download_sequence_result(dest_file, group_dir, result, accession, subtree, update_accs):
def download_sequence_result(dest_file, group_dir, result, accession, subtree, update_accs, expanded):
temp_file_path = os.path.join(group_dir, 'temp.txt')
download_report(utils.SEQUENCE, result, accession, temp_file_path, subtree)
header = True
Expand All @@ -110,29 +112,29 @@ def download_sequence_result(dest_file, group_dir, result, accession, subtree, u
if data_accession not in update_accs:
write_record = True
if write_record:
sequenceGet.write_record(dest_file, data_accession, output_format)
sequenceGet.write_record(dest_file, data_accession, output_format, expanded)
dest_file.flush()
os.remove(temp_file_path)
return update_accs

def download_sequence_group(accession, output_format, group_dir, subtree):
def download_sequence_group(accession, output_format, group_dir, subtree, expanded):
print 'Downloading sequences'
update_accs = []
dest_file_path = os.path.join(group_dir, utils.get_filename(accession + '_sequences', output_format))
dest_file = open(dest_file_path, 'w')
#sequence update
update_accs = download_sequence_result(dest_file, group_dir, utils.SEQUENCE_UPDATE_RESULT, accession, subtree, update_accs)
update_accs = download_sequence_result(dest_file, group_dir, utils.SEQUENCE_UPDATE_RESULT, accession, subtree, update_accs, expanded)
#sequence release
update_accs = download_sequence_result(dest_file, group_dir, utils.SEQUENCE_RELEASE_RESULT, accession, subtree)
update_accs = download_sequence_result(dest_file, group_dir, utils.SEQUENCE_RELEASE_RESULT, accession, subtree, update_accs, expanded)
dest_file.close()

def download_group(accession, group, output_format, dest_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree):
def download_group(accession, group, output_format, dest_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree, expanded):
group_dir = os.path.join(dest_dir, accession)
utils.create_dir(group_dir)
if group == utils.SEQUENCE:
download_sequence_group(accession, output_format, group_dir, subtree)
download_sequence_group(accession, output_format, group_dir, subtree, expanded)
else:
download_data_group(group, accession, output_format, group_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree)
download_data_group(group, accession, output_format, group_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree, expanded)


if __name__ == '__main__':
Expand All @@ -145,6 +147,7 @@ def download_group(accession, group, output_format, dest_dir, fetch_wgs, extract
dest_dir = args.dest
fetch_wgs = args.wgs
extract_wgs = args.extract_wgs
expanded = args.expanded
fetch_meta = args.meta
fetch_index = args.index
aspera = args.aspera
Expand Down Expand Up @@ -182,7 +185,7 @@ def download_group(accession, group, output_format, dest_dir, fetch_wgs, extract
if utils.is_taxid(accession) and group in ['read', 'analysis']:
print 'Sorry, tax ID retrieval not yet supported for read and analysis'
sys.exit(1)
download_group(accession, group, output_format, dest_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree)
download_group(accession, group, output_format, dest_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree, expanded)
print 'Completed'
except Exception:
utils.print_error()
Expand Down
8 changes: 5 additions & 3 deletions python/sequenceGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,16 @@

import utils

def write_record(dest_file, accession, output_format):
def write_record(dest_file, accession, output_format, expanded=False):
url = utils.get_record_url(accession, output_format)
if expanded:
url = url + '&expanded=true'
return utils.write_record(url, dest_file)

def download_sequence(dest_dir, accession, output_format):
def download_sequence(dest_dir, accession, output_format, expanded):
if output_format is None:
output_format = utils.EMBL_FORMAT
success = utils.download_record(dest_dir, accession, output_format)
success = utils.download_record(dest_dir, accession, output_format, expanded)
if not success:
print 'Unable to fetch file for {0}, format {1}'.format(accession, output_format)
return success
Expand Down
4 changes: 3 additions & 1 deletion python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,10 +259,12 @@ def get_destination_file(dest_dir, accession, output_format):
def download_single_record(url, dest_file):
urllib.urlretrieve(url, dest_file)

def download_record(dest_dir, accession, output_format):
def download_record(dest_dir, accession, output_format, expanded=False):
try:
dest_file = get_destination_file(dest_dir, accession, output_format)
url = get_record_url(accession, output_format)
if (expanded):
url = url + '&expanded=true'
download_single_record(url, dest_file)
return True
except Exception:
Expand Down
18 changes: 9 additions & 9 deletions python3/assemblyGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def extract_wgs_sequences(accession_list):
other_sequences = [a for a in accession_list if not utils.is_wgs_sequence(a)]
return wgs_sequences, other_sequences

def download_sequence_set(accession_list, mol_type, assembly_dir, output_format, quiet):
def download_sequence_set(accession_list, mol_type, assembly_dir, output_format, expanded, quiet):
failed_accessions = []
count = 0
sequence_cnt = len(accession_list)
Expand All @@ -87,7 +87,7 @@ def download_sequence_set(accession_list, mol_type, assembly_dir, output_format,
target_file_path = os.path.join(assembly_dir, utils.get_filename(mol_type, output_format))
target_file = open(target_file_path, 'wb')
for accession in accession_list:
success = sequenceGet.write_record(target_file, accession, output_format)
success = sequenceGet.write_record(target_file, accession, output_format, expanded)
if not success:
failed_accessions.append(accession)
else:
Expand All @@ -103,15 +103,15 @@ def download_sequence_set(accession_list, mol_type, assembly_dir, output_format,
print ('Failed to fetch following {0}, format {1}'.format(mol_type, output_format))
print (','.join(failed_accessions))

def download_sequences(sequence_report, assembly_dir, output_format, quiet):
def download_sequences(sequence_report, assembly_dir, output_format, expanded, quiet):
local_sequence_report = os.path.join(assembly_dir, sequence_report)
replicon_list, unlocalised_list, unplaced_list, patch_list = parse_sequence_report(local_sequence_report)
wgs_scaffolds, other_unlocalised = extract_wgs_sequences(unlocalised_list)
wgs_unplaced, other_unplaced = extract_wgs_sequences(unplaced_list)
download_sequence_set(replicon_list, REPLICON, assembly_dir, output_format, quiet)
download_sequence_set(other_unlocalised, UNLOCALISED, assembly_dir, output_format, quiet)
download_sequence_set(other_unplaced, UNPLACED, assembly_dir, output_format, quiet)
download_sequence_set(patch_list, PATCH, assembly_dir, output_format, quiet)
download_sequence_set(replicon_list, REPLICON, assembly_dir, output_format, expanded, quiet)
download_sequence_set(other_unlocalised, UNLOCALISED, assembly_dir, output_format, expanded, quiet)
download_sequence_set(other_unplaced, UNPLACED, assembly_dir, output_format, expanded, quiet)
download_sequence_set(patch_list, PATCH, assembly_dir, output_format, expanded, quiet)
wgs_scaffolds.extend(wgs_unplaced)
return wgs_scaffolds

Expand All @@ -136,7 +136,7 @@ def extract_wgs_scaffolds(assembly_dir, wgs_scaffolds, wgs_set, output_format, q
target_file.flush()
target_file.close()

def download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, quiet=False):
def download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, expanded, quiet=False):
if output_format is None:
output_format = utils.EMBL_FORMAT
assembly_dir = os.path.join(dest_dir, accession)
Expand All @@ -154,7 +154,7 @@ def download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs
wgs_scaffolds = []
wgs_scaffold_cnt = 0
if has_sequence_report:
wgs_scaffolds = download_sequences(sequence_report.split('/')[-1], assembly_dir, output_format, quiet)
wgs_scaffolds = download_sequences(sequence_report.split('/')[-1], assembly_dir, output_format, expanded, quiet)
wgs_scaffold_cnt = len(wgs_scaffolds)
if wgs_scaffold_cnt > 0:
if not quiet:
Expand Down
7 changes: 5 additions & 2 deletions python3/enaDataGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def set_parser():
help='Download WGS set for each assembly if available (default is false)')
parser.add_argument('-e', '--extract-wgs', action='store_true',
help='Extract WGS scaffolds for each assembly if available (default is false)')
parser.add_argument('-exp', '--expanded', action='store_true',
help='Expand CON scaffolds when downloading embl format (default is false)')
parser.add_argument('-m', '--meta', action='store_true',
help='Download read or analysis XML in addition to data files (default is false)')
parser.add_argument('-i', '--index', action='store_true',
Expand All @@ -65,6 +67,7 @@ def set_parser():
dest_dir = args.dest
fetch_wgs = args.wgs
extract_wgs = args.extract_wgs
expanded = args.expanded
fetch_meta = args.meta
fetch_index = args.index
aspera = args.aspera
Expand All @@ -84,7 +87,7 @@ def set_parser():
elif utils.is_sequence(accession):
if output_format is not None:
sequenceGet.check_format(output_format)
sequenceGet.download_sequence(dest_dir, accession, output_format)
sequenceGet.download_sequence(dest_dir, accession, output_format, expanded)
elif utils.is_analysis(accession):
if output_format is not None:
readGet.check_read_format(output_format)
Expand All @@ -96,7 +99,7 @@ def set_parser():
elif utils.is_assembly(accession):
if output_format is not None:
assemblyGet.check_format(output_format)
assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs)
assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, expanded)
else:
sys.stderr.write('ERROR: Invalid accession provided\n')
sys.exit(1)
Expand Down
Loading

0 comments on commit 9341ba1

Please sign in to comment.