v1.5 release

Improved checks for Aspera files and binary Handle known Python 3 Mac installation problem impacting HTTPS URLs Don't re-download run/analysis file if exists (with MD5 check) Replace (rather than append to) sequence files if they exist Download WGS set if WGS-only assembly or assembly contains WGS scaffolds (even without WGS flag) Added flag to extract WGS scaffolds from WGS file into wgs_scaffolds file Added some progress information to enaDataGet assembly fetch
enasequence · May 8, 2018 · 00b804e · 00b804e
1 parent 743cd2f
commit 00b804e
Show file tree

Hide file tree

Showing 15 changed files with 358 additions and 166 deletions.
diff --git a/README.md b/README.md
@@ -17,6 +17,12 @@ Both Python 2 and Python 3 scripts are available.  The Python 2 scripts can be f
 
 To run these scripts you will need to have Python installed.  You can download either Python 2 or Python 3 from [here](https://www.python.org/downloads/). If you already have Python installed, you can find out which version when you start the python interpreter.  If using Python 2, we suggest you upgrade to the latest version if you don't already have it: 2.7.
 
+Note that EBI now uses HTTPS servers. This can create a problem when using Python 3 on a Mac due to an oft-missed
+installation step. Please run the "Install Certificates.command" command to ensure your Python 3 installation on
+the Mac can correctly authenticate against the servers. To do this, run the following from a terminal window, updating
+the Python version with the correct version of Python 3 that you have installed:
+open "/Applications/Python 3.6/Install Certificates.command"
+
 ## Installing and running the scripts
 
 Download the [latest release](https://github.com/enasequence/enaBrowserTools/releases/latest) and extract it to the preferred location on your computer. You will now have the enaBrowserTools folder, containing both the python 2 and 3 option scripts.  If you are using a Unix/Linux or Mac computer, we suggest you add the following aliases to your .bashrc or .bash_profile file. Where INSTALLATION_DIR is the location where you have saved the enaBrowserTools to and PYTHON_CHOICE will depend on whether you are using the Python 2 or Python 3 scripts.
@@ -107,12 +113,14 @@ optional arguments:
                         File format required. Format requested must be
                         permitted for data type selected. sequence, assembly
                         and wgs accessions: embl(default) and fasta formats.
-                        read group: submitted, fastq and sra
-                        formats. analysis group: submitted only.
+                        read group: submitted, fastq and sra formats. analysis
+                        group: submitted only.
   -d DEST, --dest DEST  Destination directory (default is current running
                         directory)
   -w, --wgs             Download WGS set for each assembly if available
                         (default is false)
+  -e, --extract-wgs     Extract WGS scaffolds for each assembly if available
+                        (default is false)
   -m, --meta            Download read or analysis XML in addition to data
                         files (default is false)
   -i, --index           Download CRAM index files with submitted CRAM files,
@@ -141,7 +149,7 @@ usage: enaGroupGet [-h] [-g {sequence,wgs,assembly,read,analysis}]
                    [-i] [-a] [-as ASPERA_SETTINGS] [-t] [-v]
                    accession
 
-Download data for a given study or sample
+Download data for a given study or sample, or (for sequence and assembly) taxon
 
 positional arguments:
   accession             Study or sample accession or NCBI tax ID to fetch data
@@ -162,6 +170,8 @@ optional arguments:
                         directory)
   -w, --wgs             Download WGS set for each assembly if available
                         (default is false)
+  -e, --extract-wgs     Extract WGS scaffolds for each assembly if available
+                        (default is false)
   -m, --meta            Download read or analysis XML in addition to data
                         files (default is false)
   -i, --index           Download CRAM index files with submitted CRAM files,
@@ -180,10 +190,10 @@ optional arguments:
 
 # Tips
 
-From version 1.4, when downloading read data if you use the default format (that is, don't use the format option), the scripts will look for available files in the following priority: submitted, sra, fastq. 
+From version 1.4, when downloading read data if you use the default format (that is, don't use the format option), the scripts will look for available files in the following priority: submitted, sra, fastq.
 
 A word of advice for read formats:
-- submitted: only read data submitted to ENA have files available as submitted by the user. 
+- submitted: only read data submitted to ENA have files available as submitted by the user.
 - sra:  this is the NCBI SRA format, and is the format in which all NCBI/DDBJ data is mirrored to ENA.
 - fastq:  not all submitted format files can be converted to FASTQ
 

diff --git a/python/assemblyGet.py b/python/assemblyGet.py
@@ -20,6 +20,7 @@
 import os
 import sys
 import argparse
+import gzip
 import xml.etree.ElementTree as ElementTree
 
 import utils
@@ -31,7 +32,7 @@
 PATCH = 'patch'
 
 def check_format(output_format):
-    if format not in [utils.EMBL_FORMAT, utils.FASTA_FORMAT]:
+    if output_format not in [utils.EMBL_FORMAT, utils.FASTA_FORMAT]:
         sys.stderr.write(
             'ERROR: Invalid format. Please select a valid format for this accession: {0}\n'.format([utils.EMBL_FORMAT, utils.FASTA_FORMAT])
         )
@@ -69,31 +70,72 @@ def parse_sequence_report(local_sequence_report):
     patch_list = [l.split('\t')[0] for l in lines[1:] if PATCH in l.split('\t')[3]]
     return (replicon_list, unlocalised_list, unplaced_list, patch_list)
 
+def extract_wgs_sequences(accession_list):
+    wgs_sequences = [a for a in accession_list if utils.is_wgs_sequence(a)]
+    other_sequences = [a for a in accession_list if not utils.is_wgs_sequence(a)]
+    return wgs_sequences, other_sequences
+
 def download_sequence_set(accession_list, mol_type, assembly_dir, output_format, quiet):
     failed_accessions = []
-    if len(accession_list) > 0:
+    count = 0
+    sequence_cnt = len(accession_list)
+    divisor = utils.get_divisor(sequence_cnt)
+    if sequence_cnt > 0:
         if not quiet:
             print 'fetching sequences: ' + mol_type
-        target_file = os.path.join(assembly_dir, utils.get_filename(mol_type, output_format))
+        target_file_path = os.path.join(assembly_dir, utils.get_filename(mol_type, output_format))
+        target_file = open(target_file_path, 'w')
         for accession in accession_list:
-            success = sequenceGet.append_record(target_file, accession, output_format)
+            success = sequenceGet.write_record(target_file, accession, output_format)
             if not success:
                 failed_accessions.append(accession)
+            else:
+                count += 1
+                if count % divisor == 0 and not quiet:
+                    print 'downloaded {0} of {1} sequences'.format(count, sequence_cnt)
+        if not quiet:
+            print 'downloaded {0} of {1} sequences'.format(count, sequence_cnt)
+        target_file.close()
     elif not quiet:
         print 'no sequences: ' + mol_type
     if len(failed_accessions) > 0:
-        print 'Failed to fetch following ' + mol_type + ', format ' + output_format
-        print failed_accessions.join(',')
+        print 'Failed to fetch following {0}, format {1}'.format(mol_type, output_format)
+        print ','.join(failed_accessions)
 
 def download_sequences(sequence_report, assembly_dir, output_format, quiet):
     local_sequence_report = os.path.join(assembly_dir, sequence_report)
     replicon_list, unlocalised_list, unplaced_list, patch_list = parse_sequence_report(local_sequence_report)
+    wgs_scaffolds, other_unlocalised = _sequences(unlocalised_list)
+    wgs_unplaced, other_unplaced = extract_wgs_sequences(unplaced_list)
     download_sequence_set(replicon_list, REPLICON, assembly_dir, output_format, quiet)
-    download_sequence_set(unlocalised_list, UNLOCALISED, assembly_dir, output_format, quiet)
-    download_sequence_set(unplaced_list, UNPLACED, assembly_dir, output_format, quiet)
+    download_sequence_set(other_unlocalised, UNLOCALISED, assembly_dir, output_format, quiet)
+    download_sequence_set(other_unplaced, UNPLACED, assembly_dir, output_format, quiet)
     download_sequence_set(patch_list, PATCH, assembly_dir, output_format, quiet)
+    wgs_scaffolds.extend(wgs_unplaced)
+    return wgs_scaffolds
 
-def download_assembly(dest_dir, accession, output_format, fetch_wgs, quiet=False):
+def extract_wgs_scaffolds(assembly_dir, wgs_scaffolds, wgs_set, output_format, quiet):
+    if not quiet:
+        print 'extracting {0} WGS scaffolds from WGS set file'.format(len(wgs_scaffolds))
+    accs = [a.split('.')[0] for a in wgs_scaffolds]
+    wgs_file_path = os.path.join(assembly_dir, wgs_set + utils.get_wgs_file_ext(output_format))
+    target_file_path = os.path.join(assembly_dir, utils.get_filename('wgs_scaffolds', output_format))
+    write_line = False
+    target_file = open(target_file_path, 'w')
+    with gzip.open(wgs_file_path, 'rb') as f:
+        for line in f:
+            if utils.record_start_line(line, output_format):
+                if utils.extract_acc_from_line(line, output_format) in accs:
+                    write_line = True
+                else:
+                    write_line = False
+                    target_file.flush()
+            if write_line:
+                target_file.write(line)
+    target_file.flush()
+    target_file.close()
+
+def download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, quiet=False):
     if output_format is None:
         output_format = utils.EMBL_FORMAT
     assembly_dir = os.path.join(dest_dir, accession)
@@ -107,11 +149,23 @@ def download_assembly(dest_dir, accession, output_format, fetch_wgs, quiet=False
     # download sequence report
     if sequence_report is not None:
         has_sequence_report = utils.get_ftp_file(sequence_report, assembly_dir)
+    # parse sequence report and download sequences
+    wgs_scaffolds = []
+    wgs_scaffold_cnt = 0
+    if has_sequence_report:
+        wgs_scaffolds = download_sequences(sequence_report.split('/')[-1], assembly_dir, output_format, quiet)
+        wgs_scaffold_cnt = len(wgs_scaffolds)
+        if wgs_scaffold_cnt > 0:
+            if not quiet:
+                print 'Assembly contains {} WGS scaffolds, will fetch WGS set'.format(wgs_scaffold_cnt)
+            fetch_wgs = True
+    else:
+        fetch_wgs = True
     # download wgs set if needed
     if wgs_set is not None and fetch_wgs:
         if not quiet:
             print 'fetching wgs set'
         sequenceGet.download_wgs(assembly_dir, wgs_set, output_format)
-    # parse sequence report and download sequences
-    if has_sequence_report:
-        download_sequences(sequence_report.split('/')[-1], assembly_dir, output_format, quiet)
+        # extract wgs scaffolds from WGS file
+        if wgs_scaffold_cnt > 0 and extract_wgs:
+            extract_wgs_scaffolds(assembly_dir, wgs_scaffolds, wgs_set, output_format, quiet)
diff --git a/python/assemblyGet.pyc b/python/assemblyGet.pyc
diff --git a/python/enaDataGet.py b/python/enaDataGet.py
@@ -40,6 +40,8 @@ def set_parser():
                         help='Destination directory (default is current running directory)')
     parser.add_argument('-w', '--wgs', action='store_true',
                         help='Download WGS set for each assembly if available (default is false)')
+    parser.add_argument('-e', '--extract-wgs', action='store_true',
+                        help='Extract WGS scaffolds for each assembly if available (default is false)')
     parser.add_argument('-m', '--meta', action='store_true',
                         help='Download read or analysis XML in addition to data files (default is false)')
     parser.add_argument('-i', '--index', action='store_true',
@@ -50,7 +52,7 @@ def set_parser():
     parser.add_argument('-as', '--aspera-settings', default=None,
                     help="""Use the provided settings file, will otherwise check
                         for environment variable or default settings file location.""")
-    parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.4.1')
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.5')
     return parser
 
 
@@ -62,6 +64,7 @@ def set_parser():
     output_format = args.format
     dest_dir = args.dest
     fetch_wgs = args.wgs
+    extract_wgs = args.extract_wgs
     fetch_meta = args.meta
     fetch_index = args.index
     aspera = args.aspera
@@ -93,7 +96,7 @@ def set_parser():
         elif utils.is_assembly(accession):
             if output_format is not None:
                 assemblyGet.check_format(output_format)
-            assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs)
+            assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs)
         else:
             sys.stderr.write('ERROR: Invalid accession provided\n')
             sys.exit(1)