Merged latest upstream changes. Mostly PEP8 conformance and move to https.
StuntsPT · Jul 21, 2016 · 00226a5 · 00226a5
1 parent cd1e75d
commit 00226a5
Show file tree

Hide file tree

Showing 2 changed files with 83 additions and 60 deletions.
diff --git a/Entrez.py b/Entrez.py
@@ -1,7 +1,8 @@
 # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved.
 # Copyright 2008-2013 by Michiel de Hoon.  All rights reserved.
-# Revisions copyright 2011-2015 by Peter Cock. All rights reserved.
+# Revisions copyright 2011-2016 by Peter Cock. All rights reserved.
 # Revisions copyright 2015 by Eric Rasche. All rights reserved.
+# Revisions copyright 2015 by Carlos Pena. All rights reserved.
 # This code is part of the Biopython distribution and governed by its
 # license.  Please see the LICENSE file that should have been included
 # as part of this package.
@@ -95,7 +96,6 @@
 
 import time
 import warnings
-import os.path
 
 # Importing these functions with leading underscore as not intended for reuse
 from py3k import urlopen as _urlopen
@@ -124,7 +124,7 @@ def epost(db, **keywds):
 
     Raises an IOError exception if there's a network error.
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
     variables = {'db': db}
     variables.update(keywds)
     return _open(cgi, variables, post=True)
@@ -158,7 +158,7 @@ def efetch(db, **keywords):
     **Warning:** The NCBI changed the default retmode in Feb 2012, so many
     databases which previously returned text output now give XML.
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
     variables = {'db': db}
     variables.update(keywords)
     post = False
@@ -173,6 +173,7 @@ def efetch(db, **keywords):
         elif isinstance(ids, int):
             ids = str(ids)
             variables["id"] = ids
+
         if ids.count(",") >= 200:
             # NCBI prefers an HTTP POST instead of an HTTP GET if there are
             # more than about 200 IDs
@@ -209,7 +210,7 @@ def esearch(db, term, **keywds):
     True
 
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
     variables = {'db': db,
                  'term': term}
     variables.update(keywds)
@@ -249,7 +250,7 @@ def elink(**keywds):
 
     This is explained in much more detail in the Biopython Tutorial.
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
     variables = {}
     variables.update(keywds)
     return _open(cgi, variables)
@@ -277,7 +278,7 @@ def einfo(**keywds):
     True
 
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
     variables = {}
     variables.update(keywds)
     return _open(cgi, variables)
@@ -309,7 +310,7 @@ def esummary(**keywds):
     Computational biology and chemistry
 
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
     variables = {}
     variables.update(keywds)
     return _open(cgi, variables)
@@ -343,7 +344,7 @@ def egquery(**keywds):
     True
 
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
     variables = {}
     variables.update(keywds)
     return _open(cgi, variables)
@@ -372,12 +373,31 @@ def espell(**keywds):
     biopython
 
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
     variables = {}
     variables.update(keywds)
     return _open(cgi, variables)
 
 
+def _update_ecitmatch_variables(keywds):
+    # XML is the only supported value, and it actually returns TXT.
+    variables = {'retmode': 'xml'}
+    citation_keys = ('journal_title', 'year', 'volume', 'first_page', 'author_name', 'key')
+
+    # Accept pre-formatted strings
+    if isinstance(keywds['bdata'], str):
+        variables.update(keywds)
+    else:
+        # Alternatively accept a nicer interface
+        variables['db'] = keywds['db']
+        bdata = []
+        for citation in keywds['bdata']:
+            formatted_citation = '|'.join([citation.get(key, "") for key in citation_keys])
+            bdata.append(formatted_citation)
+        variables['bdata'] = '\r'.join(bdata)
+    return variables
+
+
 def ecitmatch(**keywds):
     """ECitMatch retrieves PMIDs-Citation linking
 
@@ -401,23 +421,8 @@ def ecitmatch(**keywds):
     >>> record = Entrez.ecitmatch(db="pubmed", bdata=[citation_1])
     >>> print(record["Query"])
     """
-    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi'
-    # XML is the only supported value, and it actually returns TXT.
-    variables = {'retmode': 'xml'}
-    citation_keys = ('journal_title', 'year', 'volume', 'first_page', 'author_name', 'key')
-
-    # Accept pre-formatted strings
-    if isinstance(keywds['bdata'], str):
-        variables.update(keywds)
-    else:
-        # Alternatively accept a nicer interface
-        variables['db'] = keywds['db']
-        bdata = []
-        for citation in keywds['bdata']:
-            formatted_citation = '|'.join([citation.get(key, "") for key in citation_keys])
-            bdata.append(formatted_citation)
-        variables['bdata'] = '\r'.join(bdata)
-
+    cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi'
+    variables = _update_ecitmatch_variables(keywds)
     return _open(cgi, variables, ecitmatch=True)
 
 
@@ -488,17 +493,12 @@ def _open(cgi, params=None, post=None, ecitmatch=False):
 
     The argument post should be a boolean to explicitly control if an HTTP
     POST should be used rather than an HTTP GET based on the query length.
-    By default (post=None), POST is used if the query URL would be over
-    1000 characters long.
-
-    The arugment post should be a boolean to explicitly control if an HTTP
-    POST should be used rather an HTTP GET based on the query length.
+    By default (post=None), POST is used if the URL encoded parameters would
+    be over 1000 characters long.
 
     This function also enforces the "up to three queries per second rule"
     to avoid abusing the NCBI servers.
     """
-    if params is None:
-        params = {}
     # NCBI requirement: At most three queries per second.
     # Equivalently, at least a third of second between queries
     delay = 0.333333334
@@ -509,6 +509,31 @@ def _open(cgi, params=None, post=None, ecitmatch=False):
         _open.previous = current + wait
     else:
         _open.previous = current
+
+    params = _construct_params(params)
+    options = _encode_options(ecitmatch, params)
+
+    # By default, post is None. Set to a boolean to over-ride length choice:
+    if post is None and len(options) > 1000:
+        post = True
+    cgi = _construct_cgi(cgi, post, options)
+
+    try:
+        if post:
+            handle = _urlopen(cgi, data=_as_bytes(options))
+        else:
+            handle = _urlopen(cgi)
+    except _HTTPError as exception:
+        raise exception
+
+    return _binary_to_string_handle(handle)
+_open.previous = 0
+
+
+def _construct_params(params):
+    if params is None:
+        params = {}
+
     # Remove None values from the parameters
     for key, value in list(params.items()):
         if value is None:
@@ -533,31 +558,23 @@ def _open(cgi, params=None, post=None, ecitmatch=False):
 In case of excessive usage of the E-utilities, NCBI will attempt to contact
 a user at the email address provided before blocking access to the
 E-utilities.""", UserWarning)
+    return params
+
 
+def _encode_options(ecitmatch, params):
     # Open a handle to Entrez.
     options = _urlencode(params, doseq=True)
     # _urlencode encodes pipes, which NCBI expects in ECitMatch
     if ecitmatch:
         options = options.replace('%7C', '|')
-    # print cgi + "?" + options
-
-    # By default, post is None. Set to a boolean to over-ride length choice:
-    if post is None and len(options) > 1000:
-        post = True
-    try:
-        if post:
-            # HTTP POST
-            handle = _urlopen(cgi, data=_as_bytes(options))
-        else:
-            # HTTP GET
-            cgi += "?" + options
-            handle = _urlopen(cgi)
-    except _HTTPError as exception:
-        raise exception
+    return options
 
-    return _binary_to_string_handle(handle)
 
-_open.previous = 0
+def _construct_cgi(cgi, post, options):
+    if not post:
+        # HTTP GET
+        cgi += "?" + options
+    return cgi
 
 
 def _test():

diff --git a/Parser.py b/Parser.py
@@ -33,7 +33,7 @@
 XML file returned by the Entrez Utilities. This is preferred over a hand-
 written solution, since the number of DTDs is rather large and their
 contents may change over time. About half the code in this parser deals
-wih parsing the DTD, and the other half with the XML itself.
+with parsing the DTD, and the other half with the XML itself.
 """
 import sys
 import re
@@ -48,7 +48,6 @@
 from py3k import urlparse as _urlparse
 from py3k import unicode
 
-__docformat__ = "restructuredtext en"
 
 # The following four classes are used to add a member .attributes to integers,
 # strings, lists, and dictionaries, respectively.
@@ -146,12 +145,20 @@ def __str__(self):
 
 
 class ValidationError(ValueError):
-    """Validating parsers raise this error if the parser finds a tag in the XML that is not defined in the DTD. Non-validating parsers do not raise this error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating parsers by default (see those functions for more information)"""
+    """XML tag found which was not defined in the DTD.
+
+    Validating parsers raise this error if the parser finds a tag in the XML
+    that is not defined in the DTD. Non-validating parsers do not raise this
+    error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
+    parsers by default (see those functions for more information).
+    """
     def __init__(self, name):
         self.name = name
 
     def __str__(self):
-        return "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False." % self.name
+        return ("Failed to find tag '%s' in the DTD. To skip all tags that "
+                "are not represented in the DTD, please call Bio.Entrez.read "
+                "or Bio.Entrez.parse with validate=False." % self.name)
 
 
 class DataHandler(object):
@@ -185,7 +192,6 @@ class DataHandler(object):
 
     global_dtd_dir = os.path.join(str(os.path.realpath(__file__)), "DTDs")
     global_xsd_dir = os.path.join(str(os.path.realpath(__file__)), "XSDs")
-
     del Entrez
 
     def __init__(self, validate):
@@ -464,10 +470,10 @@ def elementDecl(self, name, model):
             return
         # First, remove ignorable parentheses around declarations
         while (model[0] in (expat.model.XML_CTYPE_SEQ,
-                            expat.model.XML_CTYPE_CHOICE)
-          and model[1] in (expat.model.XML_CQUANT_NONE,
-                           expat.model.XML_CQUANT_OPT)
-          and len(model[3]) == 1):
+                            expat.model.XML_CTYPE_CHOICE) and
+               model[1] in (expat.model.XML_CQUANT_NONE,
+                           expat.model.XML_CQUANT_OPT) and
+               len(model[3]) == 1):
             model = model[3][0]
         # PCDATA declarations correspond to strings
         if model[0] in (expat.model.XML_CTYPE_MIXED,