Skip to content

Commit

Permalink
Merged latest upstream changes. Mostly PEP8 conformance and move to h…
Browse files Browse the repository at this point in the history
…ttps.
  • Loading branch information
StuntsPT committed Jul 21, 2016
1 parent cd1e75d commit 00226a5
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 60 deletions.
119 changes: 68 additions & 51 deletions Entrez.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright 1999-2000 by Jeffrey Chang. All rights reserved.
# Copyright 2008-2013 by Michiel de Hoon. All rights reserved.
# Revisions copyright 2011-2015 by Peter Cock. All rights reserved.
# Revisions copyright 2011-2016 by Peter Cock. All rights reserved.
# Revisions copyright 2015 by Eric Rasche. All rights reserved.
# Revisions copyright 2015 by Carlos Pena. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
Expand Down Expand Up @@ -95,7 +96,6 @@

import time
import warnings
import os.path

# Importing these functions with leading underscore as not intended for reuse
from py3k import urlopen as _urlopen
Expand Down Expand Up @@ -124,7 +124,7 @@ def epost(db, **keywds):
Raises an IOError exception if there's a network error.
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
variables = {'db': db}
variables.update(keywds)
return _open(cgi, variables, post=True)
Expand Down Expand Up @@ -158,7 +158,7 @@ def efetch(db, **keywords):
**Warning:** The NCBI changed the default retmode in Feb 2012, so many
databases which previously returned text output now give XML.
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
variables = {'db': db}
variables.update(keywords)
post = False
Expand All @@ -173,6 +173,7 @@ def efetch(db, **keywords):
elif isinstance(ids, int):
ids = str(ids)
variables["id"] = ids

if ids.count(",") >= 200:
# NCBI prefers an HTTP POST instead of an HTTP GET if there are
# more than about 200 IDs
Expand Down Expand Up @@ -209,7 +210,7 @@ def esearch(db, term, **keywds):
True
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
variables = {'db': db,
'term': term}
variables.update(keywds)
Expand Down Expand Up @@ -249,7 +250,7 @@ def elink(**keywds):
This is explained in much more detail in the Biopython Tutorial.
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
variables = {}
variables.update(keywds)
return _open(cgi, variables)
Expand Down Expand Up @@ -277,7 +278,7 @@ def einfo(**keywds):
True
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
variables = {}
variables.update(keywds)
return _open(cgi, variables)
Expand Down Expand Up @@ -309,7 +310,7 @@ def esummary(**keywds):
Computational biology and chemistry
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
variables = {}
variables.update(keywds)
return _open(cgi, variables)
Expand Down Expand Up @@ -343,7 +344,7 @@ def egquery(**keywds):
True
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
variables = {}
variables.update(keywds)
return _open(cgi, variables)
Expand Down Expand Up @@ -372,12 +373,31 @@ def espell(**keywds):
biopython
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
variables = {}
variables.update(keywds)
return _open(cgi, variables)


def _update_ecitmatch_variables(keywds):
# XML is the only supported value, and it actually returns TXT.
variables = {'retmode': 'xml'}
citation_keys = ('journal_title', 'year', 'volume', 'first_page', 'author_name', 'key')

# Accept pre-formatted strings
if isinstance(keywds['bdata'], str):
variables.update(keywds)
else:
# Alternatively accept a nicer interface
variables['db'] = keywds['db']
bdata = []
for citation in keywds['bdata']:
formatted_citation = '|'.join([citation.get(key, "") for key in citation_keys])
bdata.append(formatted_citation)
variables['bdata'] = '\r'.join(bdata)
return variables


def ecitmatch(**keywds):
"""ECitMatch retrieves PMIDs-Citation linking
Expand All @@ -401,23 +421,8 @@ def ecitmatch(**keywds):
>>> record = Entrez.ecitmatch(db="pubmed", bdata=[citation_1])
>>> print(record["Query"])
"""
cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi'
# XML is the only supported value, and it actually returns TXT.
variables = {'retmode': 'xml'}
citation_keys = ('journal_title', 'year', 'volume', 'first_page', 'author_name', 'key')

# Accept pre-formatted strings
if isinstance(keywds['bdata'], str):
variables.update(keywds)
else:
# Alternatively accept a nicer interface
variables['db'] = keywds['db']
bdata = []
for citation in keywds['bdata']:
formatted_citation = '|'.join([citation.get(key, "") for key in citation_keys])
bdata.append(formatted_citation)
variables['bdata'] = '\r'.join(bdata)

cgi = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi'
variables = _update_ecitmatch_variables(keywds)
return _open(cgi, variables, ecitmatch=True)


Expand Down Expand Up @@ -488,17 +493,12 @@ def _open(cgi, params=None, post=None, ecitmatch=False):
The argument post should be a boolean to explicitly control if an HTTP
POST should be used rather than an HTTP GET based on the query length.
By default (post=None), POST is used if the query URL would be over
1000 characters long.
The argument post should be a boolean to explicitly control if an HTTP
POST should be used rather than an HTTP GET based on the query length.
By default (post=None), POST is used if the URL encoded parameters would
be over 1000 characters long.
This function also enforces the "up to three queries per second rule"
to avoid abusing the NCBI servers.
"""
if params is None:
params = {}
# NCBI requirement: At most three queries per second.
# Equivalently, at least a third of second between queries
delay = 0.333333334
Expand All @@ -509,6 +509,31 @@ def _open(cgi, params=None, post=None, ecitmatch=False):
_open.previous = current + wait
else:
_open.previous = current

params = _construct_params(params)
options = _encode_options(ecitmatch, params)

# By default, post is None. Set to a boolean to over-ride length choice:
if post is None and len(options) > 1000:
post = True
cgi = _construct_cgi(cgi, post, options)

try:
if post:
handle = _urlopen(cgi, data=_as_bytes(options))
else:
handle = _urlopen(cgi)
except _HTTPError as exception:
raise exception

return _binary_to_string_handle(handle)
_open.previous = 0


def _construct_params(params):
if params is None:
params = {}

# Remove None values from the parameters
for key, value in list(params.items()):
if value is None:
Expand All @@ -533,31 +558,23 @@ def _open(cgi, params=None, post=None, ecitmatch=False):
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
return params


def _encode_options(ecitmatch, params):
# Open a handle to Entrez.
options = _urlencode(params, doseq=True)
# _urlencode encodes pipes, which NCBI expects in ECitMatch
if ecitmatch:
options = options.replace('%7C', '|')
# print cgi + "?" + options

# By default, post is None. Set to a boolean to over-ride length choice:
if post is None and len(options) > 1000:
post = True
try:
if post:
# HTTP POST
handle = _urlopen(cgi, data=_as_bytes(options))
else:
# HTTP GET
cgi += "?" + options
handle = _urlopen(cgi)
except _HTTPError as exception:
raise exception
return options

return _binary_to_string_handle(handle)

_open.previous = 0
def _construct_cgi(cgi, post, options):
if not post:
# HTTP GET
cgi += "?" + options
return cgi


def _test():
Expand Down
24 changes: 15 additions & 9 deletions Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
XML file returned by the Entrez Utilities. This is preferred over a hand-
written solution, since the number of DTDs is rather large and their
contents may change over time. About half the code in this parser deals
wih parsing the DTD, and the other half with the XML itself.
with parsing the DTD, and the other half with the XML itself.
"""
import sys
import re
Expand All @@ -48,7 +48,6 @@
from py3k import urlparse as _urlparse
from py3k import unicode

__docformat__ = "restructuredtext en"

# The following four classes are used to add a member .attributes to integers,
# strings, lists, and dictionaries, respectively.
Expand Down Expand Up @@ -146,12 +145,20 @@ def __str__(self):


class ValidationError(ValueError):
    """XML tag found which was not defined in the DTD.

    Validating parsers raise this error if the parser finds a tag in the XML
    that is not defined in the DTD. Non-validating parsers do not raise this
    error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
    parsers by default (see those functions for more information).
    """

    def __init__(self, name):
        """Store the name of the tag that was not found in the DTD."""
        self.name = name

    def __str__(self):
        """Return a message naming the tag and the validate=False option."""
        return ("Failed to find tag '%s' in the DTD. To skip all tags that "
                "are not represented in the DTD, please call Bio.Entrez.read "
                "or Bio.Entrez.parse with validate=False." % self.name)


class DataHandler(object):
Expand Down Expand Up @@ -185,7 +192,6 @@ class DataHandler(object):

global_dtd_dir = os.path.join(str(os.path.realpath(__file__)), "DTDs")
global_xsd_dir = os.path.join(str(os.path.realpath(__file__)), "XSDs")

del Entrez

def __init__(self, validate):
Expand Down Expand Up @@ -464,10 +470,10 @@ def elementDecl(self, name, model):
return
# First, remove ignorable parentheses around declarations
while (model[0] in (expat.model.XML_CTYPE_SEQ,
expat.model.XML_CTYPE_CHOICE)
and model[1] in (expat.model.XML_CQUANT_NONE,
expat.model.XML_CQUANT_OPT)
and len(model[3]) == 1):
expat.model.XML_CTYPE_CHOICE) and
model[1] in (expat.model.XML_CQUANT_NONE,
expat.model.XML_CQUANT_OPT) and
len(model[3]) == 1):
model = model[3][0]
# PCDATA declarations correspond to strings
if model[0] in (expat.model.XML_CTYPE_MIXED,
Expand Down

0 comments on commit 00226a5

Please sign in to comment.