Merge pull request #49 from NBISweden/py3

Converted to python3
NBISweden · Feb 20, 2020 · 778e706
2 parents 3bba80e + 2da5794
commit 778e706
Show file tree

Hide file tree

Showing 20 changed files with 14,459 additions and 14,444 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,5 +1,6 @@
 language: python
 python:
-    - "2.7"
+    - 3.5
+    - 3.7
 script:
     - t/test.sh
diff --git a/EMBLmyGFF3/EMBLmyGFF3.py b/EMBLmyGFF3/EMBLmyGFF3.py
@@ -1,4 +1,25 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python3
+
+from .modules.utilities import *
+from .modules.feature import Feature
+from .modules.help import Help
+from EMBLmyGFF3.version import __version__
+
+
+from Bio import SeqIO, Entrez
+from BCBio import GFF
+from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition
+
+import os
+import sys
+import gzip
+import pprint
+import time
+import shutil
+import logging
+import argparse
+import re
+
 """
 EMBL writer for ENA data submission. Note that this implementation is basically
 just the documentation at ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt
@@ -9,11 +30,12 @@
 
 shameless_plug="""
     #############################################################################
-    # NBIS 2019 - Sweden                                                        #
+    # EMBLmyGFF3 v{str1}                                                             #
+    # NBIS - National Bioinformatics Infrastructure Sweden                      #
     # Authors: Martin Norling, Niclas Jareborg, Jacques Dainat                  #
     # Please visit https://github.com/NBISweden/EMBLmyGFF3 for more information #
     #############################################################################
-\n"""
+\n""".format(str1=__version__)
 
 TODO="""
 TODO: find list of previous ENA release dates and numbers
@@ -22,22 +44,6 @@
 TODO: add way to handle mandatory features and feature qualifiers (especially contingent dependencies)
 """
 
-import os
-import sys
-import gzip
-import pprint
-import time
-import shutil
-import logging
-import argparse
-import re
-from modules.utilities import *
-from Bio import SeqIO, Entrez
-from BCBio import GFF
-from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition
-from modules.feature import Feature
-from modules.help import Help
-
 SCRIPT_DIR=os.path.dirname(__file__)
 FEATURE_DIR=SCRIPT_DIR + "/modules/features"
 QUALIFIER_DIR=SCRIPT_DIR + "/modules/qualifiers"
@@ -262,7 +268,7 @@ def _verify(self, key, key_type):
                 sys.stderr.write("\n'%s' is not a legal value for %s.\n" % (key, key_type))
             sys.stderr.write("Legal values are:\n")
             if type(self.legal_values[key_type]) == type({}):
-                for value, description in self.legal_values[key_type].iteritems():
+                for value, description in self.legal_values[key_type].items():
                     sys.stderr.write("  - %s\t%s\n" % (value, description))
             else:
                 for value in self.legal_values[key_type]:
@@ -271,7 +277,7 @@ def _verify(self, key, key_type):
                 sys.stderr.write("Please enter a value: ")
             else:
                 sys.stderr.write("Please enter new value: ")
-            key = raw_input()
+            key = input()
             if key.isdigit(): key = int(key)
 
         return key
@@ -289,7 +295,7 @@ def _verify_locus_tag(self, locus_tag):
 
             if not locus_tag:
                 sys.stderr.write("No value provided as locus_tag.\nPlease provide a locus_tag (A default XXX locus_tag will be set up if none provided):")
-                locus_tag = raw_input()
+                locus_tag = input()
                 if not locus_tag:
                     checked_locus_tag="XXX"
                     break
@@ -314,7 +320,7 @@ def _verify_locus_tag(self, locus_tag):
 
             if not checked_locus_tag:
                 sys.stderr.write("Please provide a locus_tag (A default XXX locus_tag will be set up if none provided):")
-                locus_tag = raw_input()
+                locus_tag = input()
                 if not locus_tag:
                     checked_locus_tag="XXX"
                     break
@@ -393,7 +399,7 @@ def print_progress(clear = False):
 
     def handle_message(self, type, msg_type, msg, value):
 
-        if EMBL.PREVIOUS_ERRORS.has_key(msg_type):
+        if msg_type in EMBL.PREVIOUS_ERRORS:
             EMBL.PREVIOUS_ERRORS[msg_type] += 1
 
         level = eval("logging.%s" % type.upper())
@@ -403,7 +409,7 @@ def handle_message(self, type, msg_type, msg, value):
         else:
             if not value:   # number of line accepted to display (defaut or given to the method)
                 value = 5
-            if not EMBL.PREVIOUS_ERRORS.has_key(msg_type) or EMBL.PREVIOUS_ERRORS[msg_type] < value:
+            if msg_type not in EMBL.PREVIOUS_ERRORS or EMBL.PREVIOUS_ERRORS[msg_type] < value:
                 logging.log(level, msg)
                 EMBL.PREVIOUS_ERRORS.setdefault(msg_type,1)
             elif EMBL.PREVIOUS_ERRORS[msg_type] == value:
@@ -860,27 +866,27 @@ def set_classification(self, classification = None, strain = None, environmental
                                          "when organism belongs to Bacteria. Please fill one of those information.(source feature keys containing "\
                                          "the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including "\
                                          "/environmental_sample must not include the /strain qualifier)\nStrain:")
-                        strain = raw_input()
+                        strain = input()
                         if strain:
                             EMBL.PREVIOUS_VALUES["strain"]=strain
                             onekey = strain
                         if not strain: #Entry with /environmental_sample must not include the /strain qualifier
                             environmental_sample = None
                             while environmental_sample != "n" and environmental_sample != "y" :
                                 sys.stderr.write("Environmental_sample [y/n]:")
-                                environmental_sample = raw_input()
+                                environmental_sample = input()
                                 if environmental_sample == "y":
                                     EMBL.PREVIOUS_VALUES["environmental_sample"]=None
                                     onekey = environmental_sample
                                     isolation_source=None
                                     sys.stderr.write("/environmental_sample qualifier should also contain the /isolation_source qualifier.")
                                     while not isolation_source: #/environmental_sample qualifier should also contain the /isolation_source qualifier
                                         sys.stderr.write("isolation_source:")
-                                        isolation_source = raw_input()
+                                        isolation_source = input()
                                     EMBL.PREVIOUS_VALUES["isolation_source"]=isolation_source
 
                         sys.stderr.write("isolate:")
-                        isolate = raw_input()
+                        isolate = input()
                         if isolate:
                             EMBL.PREVIOUS_VALUES["isolate"]=isolate
                             onekey = isolate
@@ -900,7 +906,7 @@ def set_classification(self, classification = None, strain = None, environmental
                             sys.stderr.write("/environmental_sample qualifier should also contain the /isolation_source qualifier.\n")
                             while not isolation_source: #/environmental_sample qualifier should also contain the /isolation_source qualifier
                                 sys.stderr.write("isolation_source:")
-                                isolation_source = raw_input()
+                                isolation_source = input()
                             EMBL.PREVIOUS_VALUES["isolation_source"]=isolation_source
 
 
@@ -1067,7 +1073,7 @@ def set_project_id(self, project_id = None):
         else:
             if not project_id:
                 sys.stderr.write("No project_id provided.\nPlease provide a project ID:")
-                project_id = raw_input()
+                project_id = input()
                 if not project_id:
                     project_id = "XXX"
 
@@ -1096,7 +1102,7 @@ def set_species(self, species = None):
             else:
                 while not species:
                     sys.stderr.write("No value provided for species.\nPlease provide the scientific name or taxid of the organism:")
-                    species = raw_input()
+                    species = input()
 
                 species = self.get_species_from_taxid(species)
                 self.species = species
@@ -1313,17 +1319,17 @@ def main():
         if args.gzip:
             if not outfile.endswith(".embl.gz"):
                 outfile += ".gz" if outfile.endswith(".embl") else ".embl.gz"
-            outfile = gzip.open(outfile, "wb")
+            outfile = gzip.open(outfile, "wt")
         else:
             if not outfile.endswith(".embl"):
                 outfile += ".embl"
-            outfile = open(outfile, "wb")
+            outfile = open(outfile, "w")
     else:
         outfile = sys.stdout
 
     logging.info("Reading sequence file")
-    infile = gzip.open(args.gff_file) if args.gff_file.endswith(".gz") else open(args.gff_file)
-    infasta = gzip.open(args.fasta) if args.fasta.endswith(".gz") else open(args.fasta)
+    infile = gzip.open(args.gff_file, 'rt') if args.gff_file.endswith(".gz") else open(args.gff_file)
+    infasta = gzip.open(args.fasta, 'rt') if args.fasta.endswith(".gz") else open(args.fasta)
     seq_dict = SeqIO.to_dict( SeqIO.parse(infasta, "fasta") )
     logging.info("Finished reading sequence file.")
 

diff --git a/EMBLmyGFF3/__init__.py b/EMBLmyGFF3/__init__.py
@@ -1,3 +1,3 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python3
 
-from EMBLmyGFF3 import *
+from .EMBLmyGFF3 import *
diff --git a/EMBLmyGFF3/__main__.py b/EMBLmyGFF3/__main__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python3
 
-from EMBLmyGFF3 import main
+from .EMBLmyGFF3 import main
 main()
diff --git a/EMBLmyGFF3/modules/feature.py b/EMBLmyGFF3/modules/feature.py
@@ -1,20 +1,20 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-from __future__ import division
+from .qualifier import *
+from .location import EMBLLocation
+from .utilities import *
+
+from Bio.Seq import Seq
+from Bio.Data import CodonTable
+from Bio.Alphabet.IUPAC import *
+from Bio.SeqFeature import SeqFeature, FeatureLocation, BeforePosition, AfterPosition
 
 import os
 import sys
 import json
 import logging
-from utilities import *
 from operator import attrgetter
-from Bio.Seq import Seq
-from Bio.Data import CodonTable
-from Bio.Alphabet.IUPAC import *
-from location import EMBLLocation
-from Bio.SeqFeature import SeqFeature, FeatureLocation, BeforePosition, AfterPosition
-from qualifier import *
 
 def chunk_format(string, chunk_string = None, offset = 0, chunk_size = 3, chunks_per_line = 30, indent = 6):
     offset = offset%chunk_size
@@ -142,7 +142,7 @@ def __repr__(self):
         Formats the feature as EMBL, limited to 80 character lines,
         including sub features.
         """
-        output=unicode("")
+        output=str("")
 
         if self.skip_feature is False or self.force_unknown_features or self.force_uncomplete_features:
             output = self._feature_as_EMBL(self.no_wrap_qualifier) if self.type not in self.remove else ""
@@ -202,9 +202,13 @@ def _feature_as_EMBL(self, no_wrap_qualifier):
         output = multiline("FT", string, featureType=self.type, wrap=59, split_char=",")
 
         # Print qualifiers for the feature
-        for qualifier in self.qualifiers.values():
-            if qualifier.value:
-                output += qualifier.embl_format(no_wrap_qualifier)
+        for qualifier in sorted(self.qualifiers): # sort by qualifier name
+
+            # continue if qualifier has a value
+            if self.qualifiers[qualifier].value:
+                # sort by value
+                self.qualifiers[qualifier].value = sorted(self.qualifiers[qualifier].value)
+                output += self.qualifiers[qualifier].embl_format(no_wrap_qualifier)
 
         return output
 
@@ -273,7 +277,7 @@ def _infer_ORFs(self, feature):
             sub_feature._infer_ORFs(feature)
 
     def _check_qualifier(self, feature):
-        for qualifier, value in self.qualifiers.iteritems():
+        for qualifier, value in self.qualifiers.items():
 
             # Check presence of mandatory qualifier
             if self.qualifiers[qualifier].mandatory:# Check if the qualifier is mandatory
@@ -288,7 +292,7 @@ def _load_data(self, feature, accessions):
         """
         Parses a GFF feature and stores the data in the current Feature
         """
-        for qualifier, value in feature.qualifiers.iteritems():
+        for qualifier, value in feature.qualifiers.items():
             logging.debug("Reading qualifier: %s (%s), translating to %s" % (qualifier, value, self._from_gff_qualifier(qualifier)))
             self.add_qualifier( qualifier, value )
 
@@ -302,10 +306,10 @@ def _load_definition(self, filename):
         try:
             with open(filename) as data:
                 raw = json.load( data )
-                for key, value in raw.iteritems():
+                for key, value in raw.items():
                     #logging.error("key:%s value:%s",key,value)
                     if "qualifier" in key:
-                        for item, definition in value.iteritems():
+                        for item, definition in value.items():
                             #logging.error("item:%s definition:%s",item,definition)
                             self.legal_qualifiers += [item]
                             mandatory = "mandatory" in key
@@ -335,7 +339,7 @@ def _load_feature_translations(self, filenames):
                 data = json.load( open("%s/%s" % (local_dir, filename)) )
             except IOError:
                 data = json.load( open("%s/%s" % (module_dir, filename)) )
-            for gff_feature, info in data.iteritems():
+            for gff_feature, info in data.items():
                 if info.get("remove", False):
                     self.remove += [gff_feature]
                 if "target" in info:
@@ -354,7 +358,7 @@ def _load_qualifier_translations(self, filenames):
                 data = json.load( open("%s/%s" % (local_dir, filename)) )
             except IOError:
                 data = json.load( open("%s/%s" % (module_dir, filename)) )
-            for gff_feature, info in data.iteritems():
+            for gff_feature, info in data.items():
                 if "target" in info:
                     self.qualifier_translation_list[gff_feature] = info["target"]
                 if "prefix" in info:
@@ -503,7 +507,7 @@ def combine(self, other):
         self.location += other.location
 
         # combine qualifier except codon start
-        for gff_qualifier, list_val_other in other.qualifiers.iteritems():
+        for gff_qualifier, list_val_other in other.qualifiers.items():
             other_qualifier = self._from_gff_qualifier(gff_qualifier) # get the real qualifier name in EMBL format to be able to compare with the one alredy saved
             if other_qualifier != "codon_start":
                 self.add_qualifier(gff_qualifier, list_val_other)
@@ -619,7 +623,7 @@ def translation(self):
 
     def handle_message(self, type, msg_type, msg, value):
 
-        if Feature.PREVIOUS_ERRORS.has_key(msg_type):
+        if msg_type in Feature.PREVIOUS_ERRORS:
             Feature.PREVIOUS_ERRORS[msg_type] += 1
 
         level = eval("logging.%s" % type.upper())
@@ -629,7 +633,7 @@ def handle_message(self, type, msg_type, msg, value):
         else:
             if not value:   # number of line accepted to display (defaut or given to the method)
                 value = 5
-            if not Feature.PREVIOUS_ERRORS.has_key(msg_type) or Feature.PREVIOUS_ERRORS[msg_type] < value:
+            if msg_type not in Feature.PREVIOUS_ERRORS or Feature.PREVIOUS_ERRORS[msg_type] < value:
                 logging.log(level, msg)
                 Feature.PREVIOUS_ERRORS.setdefault(msg_type,1)
             elif Feature.PREVIOUS_ERRORS[msg_type] == value:
@@ -660,11 +664,11 @@ def handle_message(self, type, msg_type, msg, value):
             break
 
         for gff_feature in record.features:
-            print gff_feature
-            print "_"*80
+            print(gff_feature)
+            print("_"*80)
             feature = Feature( gff_feature, args.translation_file, 1, feature_definition_dir = "features", qualifier_definition_dir="qualifiers" )
-            print "_"*80
-            print feature
+            print("_"*80)
+            print(feature)
             break
     except Exception as e:
         import traceback

diff --git a/EMBLmyGFF3/modules/features/parse2json.py b/EMBLmyGFF3/modules/features/parse2json.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python3
 """
 Script to parse a "raw" text copy of http://www.insdc.org/files/feature_table.html#7.2 into
 a set of json feature identifiers.
@@ -119,10 +119,10 @@ def parse_raw_to_json(infile):
                 extension = row.strip()
                 setattr(current, identifier, base + extension)
             except Exception as e:
-                print "EXCEPTION: %s" % e
-                print "ID: '%s'" % identifier
-                print "ROW: '%s'" % row
-                print current
+                print("EXCEPTION: %s" % e)
+                print("ID: '%s'" % identifier)
+                print("ROW: '%s'" % row)
+                print(current)
                 import sys
                 sys.exit(0)
         elif current != None: