diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py index 4a7b1b5..e6e974e 100644 --- a/arxiv/arxiv.py +++ b/arxiv/arxiv.py @@ -1,4 +1,6 @@ """.. include:: ../README.md""" +from __future__ import annotations + import logging import time import feedparser @@ -35,7 +37,7 @@ class Result(object): """When the result was originally published.""" title: str """The title of the result.""" - authors: list + authors: List[Author] """The result's authors.""" summary: str """The result abstract.""" @@ -55,7 +57,7 @@ class Result(object): All of the result's categories. See [arXiv: Category Taxonomy](https://arxiv.org/category_taxonomy). """ - links: list + links: List[Link] """Up to three URLs associated with this result.""" pdf_url: str """The URL of a PDF version of this result if present among links.""" @@ -71,14 +73,14 @@ def __init__( updated: datetime = _DEFAULT_TIME, published: datetime = _DEFAULT_TIME, title: str = "", - authors: List['Result.Author'] = [], + authors: List[Author] = [], summary: str = "", comment: str = "", journal_ref: str = "", doi: str = "", primary_category: str = "", categories: List[str] = [], - links: List['Result.Link'] = [], + links: List[Link] = [], _raw: feedparser.FeedParserDict = None, ): """ @@ -104,7 +106,7 @@ def __init__( # Debugging self._raw = _raw - def _from_feed_entry(entry: feedparser.FeedParserDict) -> 'Result': + def _from_feed_entry(entry: feedparser.FeedParserDict) -> Result: """ Converts a feedparser entry for an arXiv search result feed into a Result object. @@ -221,7 +223,7 @@ def download_source(self, dirpath: str = './', filename: str = '') -> str: written_path, _ = urlretrieve(source_url, path) return written_path - def _get_pdf_url(links: list) -> str: + def _get_pdf_url(links: List[Link]) -> str: """ Finds the PDF link among a result's links and returns its URL. @@ -266,7 +268,7 @@ def __init__(self, name: str): def _from_feed_author( feed_author: feedparser.FeedParserDict - ) -> 'Result.Author': + ) -> Result.Author: """ Constructs an `Author` with the name specified in an author object from a feed entry. @@ -320,7 +322,7 @@ def __init__( def _from_feed_link( feed_link: feedparser.FeedParserDict - ) -> 'Result.Link': + ) -> Result.Link: """ Constructs a `Link` with link metadata specified in a link object from a feed entry. @@ -416,7 +418,7 @@ class Search(object): See [the arXiv API User's Manual: Details of Query Construction](https://arxiv.org/help/api/user-manual#query_details). """ - id_list: list + id_list: List[str] """ A list of arXiv article IDs to which to limit the search. diff --git a/docs/index.html b/docs/index.html index 476f7b5..3b25665 100644 --- a/docs/index.html +++ b/docs/index.html @@ -491,801 +491,803 @@
1""".. include:: ../README.md""" - 2import logging - 3import time - 4import feedparser - 5import re - 6import os - 7import warnings - 8 - 9from urllib.parse import urlencode - 10from urllib.request import urlretrieve - 11from datetime import datetime, timedelta, timezone - 12from calendar import timegm - 13 - 14from enum import Enum - 15from typing import Dict, Generator, List - 16 - 17logger = logging.getLogger(__name__) + 2from __future__ import annotations + 3 + 4import logging + 5import time + 6import feedparser + 7import re + 8import os + 9import warnings + 10 + 11from urllib.parse import urlencode + 12from urllib.request import urlretrieve + 13from datetime import datetime, timedelta, timezone + 14from calendar import timegm + 15 + 16from enum import Enum + 17from typing import Dict, Generator, List 18 - 19_DEFAULT_TIME = datetime.min + 19logger = logging.getLogger(__name__) 20 - 21 - 22class Result(object): - 23 """ - 24 An entry in an arXiv query results feed. - 25 - 26 See [the arXiv API User's Manual: Details of Atom Results - 27 Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned). - 28 """ - 29 - 30 entry_id: str - 31 """A url of the form `http://arxiv.org/abs/{id}`.""" - 32 updated: datetime - 33 """When the result was last updated.""" - 34 published: datetime - 35 """When the result was originally published.""" - 36 title: str - 37 """The title of the result.""" - 38 authors: list - 39 """The result's authors.""" - 40 summary: str - 41 """The result abstract.""" - 42 comment: str - 43 """The authors' comment if present.""" - 44 journal_ref: str - 45 """A journal reference if present.""" - 46 doi: str - 47 """A URL for the resolved DOI to an external resource if present.""" - 48 primary_category: str - 49 """ - 50 The result's primary arXiv category. See [arXiv: Category - 51 Taxonomy](https://arxiv.org/category_taxonomy). - 52 """ - 53 categories: List[str] - 54 """ - 55 All of the result's categories. See [arXiv: Category - 56 Taxonomy](https://arxiv.org/category_taxonomy). - 57 """ - 58 links: list - 59 """Up to three URLs associated with this result.""" - 60 pdf_url: str - 61 """The URL of a PDF version of this result if present among links.""" - 62 _raw: feedparser.FeedParserDict - 63 """ - 64 The raw feedparser result object if this Result was constructed with - 65 Result._from_feed_entry. - 66 """ - 67 - 68 def __init__( - 69 self, - 70 entry_id: str, - 71 updated: datetime = _DEFAULT_TIME, - 72 published: datetime = _DEFAULT_TIME, - 73 title: str = "", - 74 authors: List['Result.Author'] = [], - 75 summary: str = "", - 76 comment: str = "", - 77 journal_ref: str = "", - 78 doi: str = "", - 79 primary_category: str = "", - 80 categories: List[str] = [], - 81 links: List['Result.Link'] = [], - 82 _raw: feedparser.FeedParserDict = None, - 83 ): - 84 """ - 85 Constructs an arXiv search result item. - 86 - 87 In most cases, prefer using `Result._from_feed_entry` to parsing and - 88 constructing `Result`s yourself. - 89 """ - 90 self.entry_id = entry_id - 91 self.updated = updated - 92 self.published = published - 93 self.title = title - 94 self.authors = authors - 95 self.summary = summary - 96 self.comment = comment - 97 self.journal_ref = journal_ref - 98 self.doi = doi - 99 self.primary_category = primary_category -100 self.categories = categories -101 self.links = links -102 # Calculated members -103 self.pdf_url = Result._get_pdf_url(links) -104 # Debugging -105 self._raw = _raw -106 -107 def _from_feed_entry(entry: feedparser.FeedParserDict) -> 'Result': -108 """ -109 Converts a feedparser entry for an arXiv search result feed into a -110 Result object. -111 """ -112 if not hasattr(entry, "id"): -113 raise Result.MissingFieldError("id") -114 # Title attribute may be absent for certain titles. Defaulting to "0" as -115 # it's the only title observed to cause this bug. -116 # https://github.com/lukasschwab/arxiv.py/issues/71 -117 # title = entry.title if hasattr(entry, "title") else "0" -118 title = "0" -119 if hasattr(entry, "title"): -120 title = entry.title -121 else: -122 logger.warning( -123 "Result %s is missing title attribute; defaulting to '0'", -124 entry.id -125 ) -126 return Result( -127 entry_id=entry.id, -128 updated=Result._to_datetime(entry.updated_parsed), -129 published=Result._to_datetime(entry.published_parsed), -130 title=re.sub(r'\s+', ' ', title), -131 authors=[Result.Author._from_feed_author(a) for a in entry.authors], -132 summary=entry.summary, -133 comment=entry.get('arxiv_comment'), -134 journal_ref=entry.get('arxiv_journal_ref'), -135 doi=entry.get('arxiv_doi'), -136 primary_category=entry.arxiv_primary_category.get('term'), -137 categories=[tag.get('term') for tag in entry.tags], -138 links=[Result.Link._from_feed_link(link) for link in entry.links], -139 _raw=entry -140 ) -141 -142 def __str__(self) -> str: -143 return self.entry_id -144 -145 def __repr__(self) -> str: -146 return ( -147 '{}(entry_id={}, updated={}, published={}, title={}, authors={}, ' -148 'summary={}, comment={}, journal_ref={}, doi={}, ' -149 'primary_category={}, categories={}, links={})' -150 ).format( -151 _classname(self), -152 repr(self.entry_id), -153 repr(self.updated), -154 repr(self.published), -155 repr(self.title), -156 repr(self.authors), -157 repr(self.summary), -158 repr(self.comment), -159 repr(self.journal_ref), -160 repr(self.doi), -161 repr(self.primary_category), -162 repr(self.categories), -163 repr(self.links) -164 ) -165 -166 def __eq__(self, other) -> bool: -167 if isinstance(other, Result): -168 return self.entry_id == other.entry_id -169 return False -170 -171 def get_short_id(self) -> str: -172 """ -173 Returns the short ID for this result. -174 -175 + If the result URL is `"http://arxiv.org/abs/2107.05580v1"`, -176 `result.get_short_id()` returns `2107.05580v1`. -177 -178 + If the result URL is `"http://arxiv.org/abs/quant-ph/0201082v1"`, -179 `result.get_short_id()` returns `"quant-ph/0201082v1"` (the pre-March -180 2007 arXiv identifier format). -181 -182 For an explanation of the difference between arXiv's legacy and current -183 identifiers, see [Understanding the arXiv -184 identifier](https://arxiv.org/help/arxiv_identifier). -185 """ -186 return self.entry_id.split('arxiv.org/abs/')[-1] -187 -188 def _get_default_filename(self, extension: str = "pdf") -> str: -189 """ -190 A default `to_filename` function for the extension given. -191 """ -192 nonempty_title = self.title if self.title else "UNTITLED" -193 # Remove disallowed characters. -194 clean_title = '_'.join(re.findall(r'\w+', nonempty_title)) -195 return "{}.{}.{}".format(self.get_short_id(), clean_title, extension) -196 -197 def download_pdf(self, dirpath: str = './', filename: str = '') -> str: -198 """ -199 Downloads the PDF for this result to the specified directory. -200 -201 The filename is generated by calling `to_filename(self)`. -202 """ -203 if not filename: -204 filename = self._get_default_filename() -205 path = os.path.join(dirpath, filename) -206 written_path, _ = urlretrieve(self.pdf_url, path) -207 return written_path -208 -209 def download_source(self, dirpath: str = './', filename: str = '') -> str: -210 """ -211 Downloads the source tarfile for this result to the specified -212 directory. -213 -214 The filename is generated by calling `to_filename(self)`. -215 """ -216 if not filename: -217 filename = self._get_default_filename('tar.gz') -218 path = os.path.join(dirpath, filename) -219 # Bodge: construct the source URL from the PDF URL. -220 source_url = self.pdf_url.replace('/pdf/', '/src/') -221 written_path, _ = urlretrieve(source_url, path) -222 return written_path -223 -224 def _get_pdf_url(links: list) -> str: -225 """ -226 Finds the PDF link among a result's links and returns its URL. -227 -228 Should only be called once for a given `Result`, in its constructor. -229 After construction, the URL should be available in `Result.pdf_url`. -230 """ -231 pdf_urls = [link.href for link in links if link.title == 'pdf'] -232 if len(pdf_urls) == 0: -233 return None -234 elif len(pdf_urls) > 1: -235 logger.warning( -236 "Result has multiple PDF links; using %s", -237 pdf_urls[0] -238 ) -239 return pdf_urls[0] -240 -241 def _to_datetime(ts: time.struct_time) -> datetime: -242 """ -243 Converts a UTC time.struct_time into a time-zone-aware datetime. -244 -245 This will be replaced with feedparser functionality [when it becomes -246 available](https://github.com/kurtmckee/feedparser/issues/212). -247 """ -248 return datetime.fromtimestamp(timegm(ts), tz=timezone.utc) -249 -250 class Author(object): -251 """ -252 A light inner class for representing a result's authors. -253 """ -254 -255 name: str -256 """The author's name.""" -257 -258 def __init__(self, name: str): -259 """ -260 Constructs an `Author` with the specified name. -261 -262 In most cases, prefer using `Author._from_feed_author` to parsing -263 and constructing `Author`s yourself. -264 """ -265 self.name = name -266 -267 def _from_feed_author( -268 feed_author: feedparser.FeedParserDict -269 ) -> 'Result.Author': -270 """ -271 Constructs an `Author` with the name specified in an author object -272 from a feed entry. -273 -274 See usage in `Result._from_feed_entry`. -275 """ -276 return Result.Author(feed_author.name) -277 -278 def __str__(self) -> str: -279 return self.name -280 -281 def __repr__(self) -> str: -282 return '{}({})'.format(_classname(self), repr(self.name)) -283 -284 def __eq__(self, other) -> bool: -285 if isinstance(other, Result.Author): -286 return self.name == other.name -287 return False -288 -289 class Link(object): -290 """ -291 A light inner class for representing a result's links. -292 """ -293 -294 href: str -295 """The link's `href` attribute.""" -296 title: str -297 """The link's title.""" -298 rel: str -299 """The link's relationship to the `Result`.""" -300 content_type: str -301 """The link's HTTP content type.""" -302 -303 def __init__( -304 self, -305 href: str, -306 title: str = None, -307 rel: str = None, -308 content_type: str = None -309 ): -310 """ -311 Constructs a `Link` with the specified link metadata. -312 -313 In most cases, prefer using `Link._from_feed_link` to parsing and -314 constructing `Link`s yourself. -315 """ -316 self.href = href -317 self.title = title -318 self.rel = rel -319 self.content_type = content_type -320 -321 def _from_feed_link( -322 feed_link: feedparser.FeedParserDict -323 ) -> 'Result.Link': -324 """ -325 Constructs a `Link` with link metadata specified in a link object -326 from a feed entry. -327 -328 See usage in `Result._from_feed_entry`. -329 """ -330 return Result.Link( -331 href=feed_link.href, -332 title=feed_link.get('title'), -333 rel=feed_link.get('rel'), -334 content_type=feed_link.get('content_type') -335 ) -336 -337 def __str__(self) -> str: -338 return self.href -339 -340 def __repr__(self) -> str: -341 return '{}({}, title={}, rel={}, content_type={})'.format( -342 _classname(self), -343 repr(self.href), -344 repr(self.title), -345 repr(self.rel), -346 repr(self.content_type) -347 ) -348 -349 def __eq__(self, other) -> bool: -350 if isinstance(other, Result.Link): -351 return self.href == other.href -352 return False -353 -354 class MissingFieldError(Exception): -355 """ -356 An error indicating an entry is unparseable because it lacks required -357 fields. -358 """ -359 -360 missing_field: str -361 """The required field missing from the would-be entry.""" -362 message: str -363 """Message describing what caused this error.""" -364 -365 def __init__(self, missing_field): -366 self.missing_field = missing_field -367 self.message = "Entry from arXiv missing required info" -368 -369 def __repr__(self) -> str: -370 return '{}({})'.format( -371 _classname(self), -372 repr(self.missing_field) -373 ) -374 -375 -376class SortCriterion(Enum): -377 """ -378 A SortCriterion identifies a property by which search results can be -379 sorted. -380 -381 See [the arXiv API User's Manual: sort order for return -382 results](https://arxiv.org/help/api/user-manual#sort). -383 """ -384 Relevance = "relevance" -385 LastUpdatedDate = "lastUpdatedDate" -386 SubmittedDate = "submittedDate" -387 -388 -389class SortOrder(Enum): -390 """ -391 A SortOrder indicates order in which search results are sorted according -392 to the specified arxiv.SortCriterion. -393 -394 See [the arXiv API User's Manual: sort order for return -395 results](https://arxiv.org/help/api/user-manual#sort). -396 """ -397 Ascending = "ascending" -398 Descending = "descending" -399 -400 -401class Search(object): -402 """ -403 A specification for a search of arXiv's database. -404 -405 To run a search, use `Search.run` to use a default client or `Client.run` -406 with a specific client. -407 """ -408 -409 query: str -410 """ -411 A query string. -412 -413 This should be unencoded. Use `au:del_maestro AND ti:checkerboard`, not -414 `au:del_maestro+AND+ti:checkerboard`. -415 -416 See [the arXiv API User's Manual: Details of Query -417 Construction](https://arxiv.org/help/api/user-manual#query_details). -418 """ -419 id_list: list -420 """ -421 A list of arXiv article IDs to which to limit the search. -422 -423 See [the arXiv API User's -424 Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list) -425 for documentation of the interaction between `query` and `id_list`. -426 """ -427 max_results: float -428 """ -429 The maximum number of results to be returned in an execution of this -430 search. -431 -432 To fetch every result available, set `max_results=float('inf')`. -433 """ -434 sort_by: SortCriterion -435 """The sort criterion for results.""" -436 sort_order: SortOrder -437 """The sort order for results.""" -438 -439 def __init__( -440 self, -441 query: str = "", -442 id_list: List[str] = [], -443 max_results: float = float('inf'), -444 sort_by: SortCriterion = SortCriterion.Relevance, -445 sort_order: SortOrder = SortOrder.Descending -446 ): -447 """ -448 Constructs an arXiv API search with the specified criteria. -449 """ -450 self.query = query -451 self.id_list = id_list -452 self.max_results = max_results -453 self.sort_by = sort_by -454 self.sort_order = sort_order -455 -456 def __str__(self) -> str: -457 # TODO: develop a more informative string representation. -458 return repr(self) -459 -460 def __repr__(self) -> str: -461 return ( -462 '{}(query={}, id_list={}, max_results={}, sort_by={}, ' -463 'sort_order={})' -464 ).format( -465 _classname(self), -466 repr(self.query), -467 repr(self.id_list), -468 repr(self.max_results), -469 repr(self.sort_by), -470 repr(self.sort_order) -471 ) -472 -473 def _url_args(self) -> Dict[str, str]: -474 """ -475 Returns a dict of search parameters that should be included in an API -476 request for this search. -477 """ -478 return { -479 "search_query": self.query, -480 "id_list": ','.join(self.id_list), -481 "sortBy": self.sort_by.value, -482 "sortOrder": self.sort_order.value -483 } -484 -485 def get(self) -> Generator[Result, None, None]: -486 """ -487 **Deprecated** after 1.2.0; use `Search.results`. -488 """ -489 warnings.warn( -490 "The 'get' method is deprecated, use 'results' instead", -491 DeprecationWarning, -492 stacklevel=2 -493 ) -494 return self.results() -495 -496 def results(self, offset: int = 0) -> Generator[Result, None, None]: -497 """ -498 Executes the specified search using a default arXiv API client. -499 -500 For info on default behavior, see `Client.__init__` and `Client.results`. -501 """ -502 return Client().results(self, offset=offset) -503 -504 -505class Client(object): -506 """ -507 Specifies a strategy for fetching results from arXiv's API. -508 -509 This class obscures pagination and retry logic, and exposes -510 `Client.results`. -511 """ -512 -513 query_url_format = 'http://export.arxiv.org/api/query?{}' -514 """The arXiv query API endpoint format.""" -515 page_size: int -516 """Maximum number of results fetched in a single API request.""" -517 delay_seconds: int -518 """Number of seconds to wait between API requests.""" -519 num_retries: int -520 """Number of times to retry a failing API request.""" -521 _last_request_dt: datetime -522 -523 def __init__( -524 self, -525 page_size: int = 100, -526 delay_seconds: int = 3, -527 num_retries: int = 3 -528 ): -529 """ -530 Constructs an arXiv API client with the specified options. -531 -532 Note: the default parameters should provide a robust request strategy -533 for most use cases. Extreme page sizes, delays, or retries risk -534 violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou), -535 brittle behavior, and inconsistent results. -536 """ -537 self.page_size = page_size -538 self.delay_seconds = delay_seconds -539 self.num_retries = num_retries -540 self._last_request_dt = None -541 -542 def __str__(self) -> str: -543 # TODO: develop a more informative string representation. -544 return repr(self) -545 -546 def __repr__(self) -> str: -547 return '{}(page_size={}, delay_seconds={}, num_retries={})'.format( -548 _classname(self), -549 repr(self.page_size), -550 repr(self.delay_seconds), -551 repr(self.num_retries) -552 ) -553 -554 def get(self, search: Search) -> Generator[Result, None, None]: -555 """ -556 **Deprecated** after 1.2.0; use `Client.results`. -557 """ -558 warnings.warn( -559 "The 'get' method is deprecated, use 'results' instead", -560 DeprecationWarning, -561 stacklevel=2 -562 ) -563 return self.results(search) -564 -565 def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]: -566 """ -567 Uses this client configuration to fetch one page of the search results -568 at a time, yielding the parsed `Result`s, until `max_results` results -569 have been yielded or there are no more search results. -570 -571 If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`. + 21_DEFAULT_TIME = datetime.min + 22 + 23 + 24class Result(object): + 25 """ + 26 An entry in an arXiv query results feed. + 27 + 28 See [the arXiv API User's Manual: Details of Atom Results + 29 Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned). + 30 """ + 31 + 32 entry_id: str + 33 """A url of the form `http://arxiv.org/abs/{id}`.""" + 34 updated: datetime + 35 """When the result was last updated.""" + 36 published: datetime + 37 """When the result was originally published.""" + 38 title: str + 39 """The title of the result.""" + 40 authors: List[Author] + 41 """The result's authors.""" + 42 summary: str + 43 """The result abstract.""" + 44 comment: str + 45 """The authors' comment if present.""" + 46 journal_ref: str + 47 """A journal reference if present.""" + 48 doi: str + 49 """A URL for the resolved DOI to an external resource if present.""" + 50 primary_category: str + 51 """ + 52 The result's primary arXiv category. See [arXiv: Category + 53 Taxonomy](https://arxiv.org/category_taxonomy). + 54 """ + 55 categories: List[str] + 56 """ + 57 All of the result's categories. See [arXiv: Category + 58 Taxonomy](https://arxiv.org/category_taxonomy). + 59 """ + 60 links: List[Link] + 61 """Up to three URLs associated with this result.""" + 62 pdf_url: str + 63 """The URL of a PDF version of this result if present among links.""" + 64 _raw: feedparser.FeedParserDict + 65 """ + 66 The raw feedparser result object if this Result was constructed with + 67 Result._from_feed_entry. + 68 """ + 69 + 70 def __init__( + 71 self, + 72 entry_id: str, + 73 updated: datetime = _DEFAULT_TIME, + 74 published: datetime = _DEFAULT_TIME, + 75 title: str = "", + 76 authors: List[Author] = [], + 77 summary: str = "", + 78 comment: str = "", + 79 journal_ref: str = "", + 80 doi: str = "", + 81 primary_category: str = "", + 82 categories: List[str] = [], + 83 links: List[Link] = [], + 84 _raw: feedparser.FeedParserDict = None, + 85 ): + 86 """ + 87 Constructs an arXiv search result item. + 88 + 89 In most cases, prefer using `Result._from_feed_entry` to parsing and + 90 constructing `Result`s yourself. + 91 """ + 92 self.entry_id = entry_id + 93 self.updated = updated + 94 self.published = published + 95 self.title = title + 96 self.authors = authors + 97 self.summary = summary + 98 self.comment = comment + 99 self.journal_ref = journal_ref +100 self.doi = doi +101 self.primary_category = primary_category +102 self.categories = categories +103 self.links = links +104 # Calculated members +105 self.pdf_url = Result._get_pdf_url(links) +106 # Debugging +107 self._raw = _raw +108 +109 def _from_feed_entry(entry: feedparser.FeedParserDict) -> Result: +110 """ +111 Converts a feedparser entry for an arXiv search result feed into a +112 Result object. +113 """ +114 if not hasattr(entry, "id"): +115 raise Result.MissingFieldError("id") +116 # Title attribute may be absent for certain titles. Defaulting to "0" as +117 # it's the only title observed to cause this bug. +118 # https://github.com/lukasschwab/arxiv.py/issues/71 +119 # title = entry.title if hasattr(entry, "title") else "0" +120 title = "0" +121 if hasattr(entry, "title"): +122 title = entry.title +123 else: +124 logger.warning( +125 "Result %s is missing title attribute; defaulting to '0'", +126 entry.id +127 ) +128 return Result( +129 entry_id=entry.id, +130 updated=Result._to_datetime(entry.updated_parsed), +131 published=Result._to_datetime(entry.published_parsed), +132 title=re.sub(r'\s+', ' ', title), +133 authors=[Result.Author._from_feed_author(a) for a in entry.authors], +134 summary=entry.summary, +135 comment=entry.get('arxiv_comment'), +136 journal_ref=entry.get('arxiv_journal_ref'), +137 doi=entry.get('arxiv_doi'), +138 primary_category=entry.arxiv_primary_category.get('term'), +139 categories=[tag.get('term') for tag in entry.tags], +140 links=[Result.Link._from_feed_link(link) for link in entry.links], +141 _raw=entry +142 ) +143 +144 def __str__(self) -> str: +145 return self.entry_id +146 +147 def __repr__(self) -> str: +148 return ( +149 '{}(entry_id={}, updated={}, published={}, title={}, authors={}, ' +150 'summary={}, comment={}, journal_ref={}, doi={}, ' +151 'primary_category={}, categories={}, links={})' +152 ).format( +153 _classname(self), +154 repr(self.entry_id), +155 repr(self.updated), +156 repr(self.published), +157 repr(self.title), +158 repr(self.authors), +159 repr(self.summary), +160 repr(self.comment), +161 repr(self.journal_ref), +162 repr(self.doi), +163 repr(self.primary_category), +164 repr(self.categories), +165 repr(self.links) +166 ) +167 +168 def __eq__(self, other) -> bool: +169 if isinstance(other, Result): +170 return self.entry_id == other.entry_id +171 return False +172 +173 def get_short_id(self) -> str: +174 """ +175 Returns the short ID for this result. +176 +177 + If the result URL is `"http://arxiv.org/abs/2107.05580v1"`, +178 `result.get_short_id()` returns `2107.05580v1`. +179 +180 + If the result URL is `"http://arxiv.org/abs/quant-ph/0201082v1"`, +181 `result.get_short_id()` returns `"quant-ph/0201082v1"` (the pre-March +182 2007 arXiv identifier format). +183 +184 For an explanation of the difference between arXiv's legacy and current +185 identifiers, see [Understanding the arXiv +186 identifier](https://arxiv.org/help/arxiv_identifier). +187 """ +188 return self.entry_id.split('arxiv.org/abs/')[-1] +189 +190 def _get_default_filename(self, extension: str = "pdf") -> str: +191 """ +192 A default `to_filename` function for the extension given. +193 """ +194 nonempty_title = self.title if self.title else "UNTITLED" +195 # Remove disallowed characters. +196 clean_title = '_'.join(re.findall(r'\w+', nonempty_title)) +197 return "{}.{}.{}".format(self.get_short_id(), clean_title, extension) +198 +199 def download_pdf(self, dirpath: str = './', filename: str = '') -> str: +200 """ +201 Downloads the PDF for this result to the specified directory. +202 +203 The filename is generated by calling `to_filename(self)`. +204 """ +205 if not filename: +206 filename = self._get_default_filename() +207 path = os.path.join(dirpath, filename) +208 written_path, _ = urlretrieve(self.pdf_url, path) +209 return written_path +210 +211 def download_source(self, dirpath: str = './', filename: str = '') -> str: +212 """ +213 Downloads the source tarfile for this result to the specified +214 directory. +215 +216 The filename is generated by calling `to_filename(self)`. +217 """ +218 if not filename: +219 filename = self._get_default_filename('tar.gz') +220 path = os.path.join(dirpath, filename) +221 # Bodge: construct the source URL from the PDF URL. +222 source_url = self.pdf_url.replace('/pdf/', '/src/') +223 written_path, _ = urlretrieve(source_url, path) +224 return written_path +225 +226 def _get_pdf_url(links: List[Link]) -> str: +227 """ +228 Finds the PDF link among a result's links and returns its URL. +229 +230 Should only be called once for a given `Result`, in its constructor. +231 After construction, the URL should be available in `Result.pdf_url`. +232 """ +233 pdf_urls = [link.href for link in links if link.title == 'pdf'] +234 if len(pdf_urls) == 0: +235 return None +236 elif len(pdf_urls) > 1: +237 logger.warning( +238 "Result has multiple PDF links; using %s", +239 pdf_urls[0] +240 ) +241 return pdf_urls[0] +242 +243 def _to_datetime(ts: time.struct_time) -> datetime: +244 """ +245 Converts a UTC time.struct_time into a time-zone-aware datetime. +246 +247 This will be replaced with feedparser functionality [when it becomes +248 available](https://github.com/kurtmckee/feedparser/issues/212). +249 """ +250 return datetime.fromtimestamp(timegm(ts), tz=timezone.utc) +251 +252 class Author(object): +253 """ +254 A light inner class for representing a result's authors. +255 """ +256 +257 name: str +258 """The author's name.""" +259 +260 def __init__(self, name: str): +261 """ +262 Constructs an `Author` with the specified name. +263 +264 In most cases, prefer using `Author._from_feed_author` to parsing +265 and constructing `Author`s yourself. +266 """ +267 self.name = name +268 +269 def _from_feed_author( +270 feed_author: feedparser.FeedParserDict +271 ) -> Result.Author: +272 """ +273 Constructs an `Author` with the name specified in an author object +274 from a feed entry. +275 +276 See usage in `Result._from_feed_entry`. +277 """ +278 return Result.Author(feed_author.name) +279 +280 def __str__(self) -> str: +281 return self.name +282 +283 def __repr__(self) -> str: +284 return '{}({})'.format(_classname(self), repr(self.name)) +285 +286 def __eq__(self, other) -> bool: +287 if isinstance(other, Result.Author): +288 return self.name == other.name +289 return False +290 +291 class Link(object): +292 """ +293 A light inner class for representing a result's links. +294 """ +295 +296 href: str +297 """The link's `href` attribute.""" +298 title: str +299 """The link's title.""" +300 rel: str +301 """The link's relationship to the `Result`.""" +302 content_type: str +303 """The link's HTTP content type.""" +304 +305 def __init__( +306 self, +307 href: str, +308 title: str = None, +309 rel: str = None, +310 content_type: str = None +311 ): +312 """ +313 Constructs a `Link` with the specified link metadata. +314 +315 In most cases, prefer using `Link._from_feed_link` to parsing and +316 constructing `Link`s yourself. +317 """ +318 self.href = href +319 self.title = title +320 self.rel = rel +321 self.content_type = content_type +322 +323 def _from_feed_link( +324 feed_link: feedparser.FeedParserDict +325 ) -> Result.Link: +326 """ +327 Constructs a `Link` with link metadata specified in a link object +328 from a feed entry. +329 +330 See usage in `Result._from_feed_entry`. +331 """ +332 return Result.Link( +333 href=feed_link.href, +334 title=feed_link.get('title'), +335 rel=feed_link.get('rel'), +336 content_type=feed_link.get('content_type') +337 ) +338 +339 def __str__(self) -> str: +340 return self.href +341 +342 def __repr__(self) -> str: +343 return '{}({}, title={}, rel={}, content_type={})'.format( +344 _classname(self), +345 repr(self.href), +346 repr(self.title), +347 repr(self.rel), +348 repr(self.content_type) +349 ) +350 +351 def __eq__(self, other) -> bool: +352 if isinstance(other, Result.Link): +353 return self.href == other.href +354 return False +355 +356 class MissingFieldError(Exception): +357 """ +358 An error indicating an entry is unparseable because it lacks required +359 fields. +360 """ +361 +362 missing_field: str +363 """The required field missing from the would-be entry.""" +364 message: str +365 """Message describing what caused this error.""" +366 +367 def __init__(self, missing_field): +368 self.missing_field = missing_field +369 self.message = "Entry from arXiv missing required info" +370 +371 def __repr__(self) -> str: +372 return '{}({})'.format( +373 _classname(self), +374 repr(self.missing_field) +375 ) +376 +377 +378class SortCriterion(Enum): +379 """ +380 A SortCriterion identifies a property by which search results can be +381 sorted. +382 +383 See [the arXiv API User's Manual: sort order for return +384 results](https://arxiv.org/help/api/user-manual#sort). +385 """ +386 Relevance = "relevance" +387 LastUpdatedDate = "lastUpdatedDate" +388 SubmittedDate = "submittedDate" +389 +390 +391class SortOrder(Enum): +392 """ +393 A SortOrder indicates order in which search results are sorted according +394 to the specified arxiv.SortCriterion. +395 +396 See [the arXiv API User's Manual: sort order for return +397 results](https://arxiv.org/help/api/user-manual#sort). +398 """ +399 Ascending = "ascending" +400 Descending = "descending" +401 +402 +403class Search(object): +404 """ +405 A specification for a search of arXiv's database. +406 +407 To run a search, use `Search.run` to use a default client or `Client.run` +408 with a specific client. +409 """ +410 +411 query: str +412 """ +413 A query string. +414 +415 This should be unencoded. Use `au:del_maestro AND ti:checkerboard`, not +416 `au:del_maestro+AND+ti:checkerboard`. +417 +418 See [the arXiv API User's Manual: Details of Query +419 Construction](https://arxiv.org/help/api/user-manual#query_details). +420 """ +421 id_list: List[str] +422 """ +423 A list of arXiv article IDs to which to limit the search. +424 +425 See [the arXiv API User's +426 Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list) +427 for documentation of the interaction between `query` and `id_list`. +428 """ +429 max_results: float +430 """ +431 The maximum number of results to be returned in an execution of this +432 search. +433 +434 To fetch every result available, set `max_results=float('inf')`. +435 """ +436 sort_by: SortCriterion +437 """The sort criterion for results.""" +438 sort_order: SortOrder +439 """The sort order for results.""" +440 +441 def __init__( +442 self, +443 query: str = "", +444 id_list: List[str] = [], +445 max_results: float = float('inf'), +446 sort_by: SortCriterion = SortCriterion.Relevance, +447 sort_order: SortOrder = SortOrder.Descending +448 ): +449 """ +450 Constructs an arXiv API search with the specified criteria. +451 """ +452 self.query = query +453 self.id_list = id_list +454 self.max_results = max_results +455 self.sort_by = sort_by +456 self.sort_order = sort_order +457 +458 def __str__(self) -> str: +459 # TODO: develop a more informative string representation. +460 return repr(self) +461 +462 def __repr__(self) -> str: +463 return ( +464 '{}(query={}, id_list={}, max_results={}, sort_by={}, ' +465 'sort_order={})' +466 ).format( +467 _classname(self), +468 repr(self.query), +469 repr(self.id_list), +470 repr(self.max_results), +471 repr(self.sort_by), +472 repr(self.sort_order) +473 ) +474 +475 def _url_args(self) -> Dict[str, str]: +476 """ +477 Returns a dict of search parameters that should be included in an API +478 request for this search. +479 """ +480 return { +481 "search_query": self.query, +482 "id_list": ','.join(self.id_list), +483 "sortBy": self.sort_by.value, +484 "sortOrder": self.sort_order.value +485 } +486 +487 def get(self) -> Generator[Result, None, None]: +488 """ +489 **Deprecated** after 1.2.0; use `Search.results`. +490 """ +491 warnings.warn( +492 "The 'get' method is deprecated, use 'results' instead", +493 DeprecationWarning, +494 stacklevel=2 +495 ) +496 return self.results() +497 +498 def results(self, offset: int = 0) -> Generator[Result, None, None]: +499 """ +500 Executes the specified search using a default arXiv API client. +501 +502 For info on default behavior, see `Client.__init__` and `Client.results`. +503 """ +504 return Client().results(self, offset=offset) +505 +506 +507class Client(object): +508 """ +509 Specifies a strategy for fetching results from arXiv's API. +510 +511 This class obscures pagination and retry logic, and exposes +512 `Client.results`. +513 """ +514 +515 query_url_format = 'http://export.arxiv.org/api/query?{}' +516 """The arXiv query API endpoint format.""" +517 page_size: int +518 """Maximum number of results fetched in a single API request.""" +519 delay_seconds: int +520 """Number of seconds to wait between API requests.""" +521 num_retries: int +522 """Number of times to retry a failing API request.""" +523 _last_request_dt: datetime +524 +525 def __init__( +526 self, +527 page_size: int = 100, +528 delay_seconds: int = 3, +529 num_retries: int = 3 +530 ): +531 """ +532 Constructs an arXiv API client with the specified options. +533 +534 Note: the default parameters should provide a robust request strategy +535 for most use cases. Extreme page sizes, delays, or retries risk +536 violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou), +537 brittle behavior, and inconsistent results. +538 """ +539 self.page_size = page_size +540 self.delay_seconds = delay_seconds +541 self.num_retries = num_retries +542 self._last_request_dt = None +543 +544 def __str__(self) -> str: +545 # TODO: develop a more informative string representation. +546 return repr(self) +547 +548 def __repr__(self) -> str: +549 return '{}(page_size={}, delay_seconds={}, num_retries={})'.format( +550 _classname(self), +551 repr(self.page_size), +552 repr(self.delay_seconds), +553 repr(self.num_retries) +554 ) +555 +556 def get(self, search: Search) -> Generator[Result, None, None]: +557 """ +558 **Deprecated** after 1.2.0; use `Client.results`. +559 """ +560 warnings.warn( +561 "The 'get' method is deprecated, use 'results' instead", +562 DeprecationWarning, +563 stacklevel=2 +564 ) +565 return self.results(search) +566 +567 def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]: +568 """ +569 Uses this client configuration to fetch one page of the search results +570 at a time, yielding the parsed `Result`s, until `max_results` results +571 have been yielded or there are no more search results. 572 -573 Setting a nonzero `offset` discards leading records in the result set. -574 When `offset` is greater than or equal to `search.max_results`, the full -575 result set is discarded. -576 -577 For more on using generators, see -578 [Generators](https://wiki.python.org/moin/Generators). -579 """ -580 -581 # total_results may be reduced according to the feed's -582 # opensearch:totalResults value. -583 total_results = search.max_results -584 first_page = True -585 while offset < total_results: -586 page_size = min(self.page_size, search.max_results - offset) -587 logger.info("Requesting {} results at offset {}".format( -588 page_size, -589 offset, -590 )) -591 page_url = self._format_url(search, offset, page_size) -592 feed = self._parse_feed(page_url, first_page) -593 if first_page: -594 # NOTE: this is an ugly fix for a known bug. The totalresults -595 # value is set to 1 for results with zero entries. If that API -596 # bug is fixed, we can remove this conditional and always set -597 # `total_results = min(...)`. -598 if len(feed.entries) == 0: -599 logger.info("Got empty results; stopping generation") -600 total_results = 0 -601 else: -602 total_results = min( -603 total_results, -604 int(feed.feed.opensearch_totalresults) -605 ) -606 logger.info("Got first page; {} of {} results available".format( -607 total_results, -608 search.max_results -609 )) -610 # Subsequent pages are not the first page. -611 first_page = False -612 # Update offset for next request: account for received results. -613 offset += len(feed.entries) -614 # Yield query results until page is exhausted. -615 for entry in feed.entries: -616 try: -617 yield Result._from_feed_entry(entry) -618 except Result.MissingFieldError: -619 logger.warning("Skipping partial result") -620 continue -621 -622 def _format_url(self, search: Search, start: int, page_size: int) -> str: -623 """ -624 Construct a request API for search that returns up to `page_size` -625 results starting with the result at index `start`. -626 """ -627 url_args = search._url_args() -628 url_args.update({ -629 "start": start, -630 "max_results": page_size, -631 }) -632 return self.query_url_format.format(urlencode(url_args)) -633 -634 def _parse_feed( -635 self, -636 url: str, -637 first_page: bool = True -638 ) -> feedparser.FeedParserDict: -639 """ -640 Fetches the specified URL and parses it with feedparser. -641 -642 If a request fails or is unexpectedly empty, retries the request up to -643 `self.num_retries` times. -644 """ -645 # Invoke the recursive helper with initial available retries. -646 return self.__try_parse_feed( -647 url, -648 first_page=first_page, -649 retries_left=self.num_retries -650 ) -651 -652 def __try_parse_feed( -653 self, -654 url: str, -655 first_page: bool, -656 retries_left: int, -657 last_err: Exception = None, -658 ) -> feedparser.FeedParserDict: -659 """ -660 Recursive helper for _parse_feed. Enforces `self.delay_seconds`: if that -661 number of seconds has not passed since `_parse_feed` was last called, -662 sleeps until delay_seconds seconds have passed. -663 """ -664 retry = self.num_retries - retries_left -665 # If this call would violate the rate limit, sleep until it doesn't. -666 if self._last_request_dt is not None: -667 required = timedelta(seconds=self.delay_seconds) -668 since_last_request = datetime.now() - self._last_request_dt -669 if since_last_request < required: -670 to_sleep = (required - since_last_request).total_seconds() -671 logger.info("Sleeping for %f seconds", to_sleep) -672 time.sleep(to_sleep) -673 logger.info("Requesting page of results", extra={ -674 'url': url, -675 'first_page': first_page, -676 'retry': retry, -677 'last_err': last_err.message if last_err is not None else None, -678 }) -679 feed = feedparser.parse(url) -680 self._last_request_dt = datetime.now() -681 err = None -682 if feed.status != 200: -683 err = HTTPError(url, retry, feed) -684 elif len(feed.entries) == 0 and not first_page: -685 err = UnexpectedEmptyPageError(url, retry) -686 if err is not None: -687 if retries_left > 0: -688 return self.__try_parse_feed( -689 url, -690 first_page=first_page, -691 retries_left=retries_left-1, -692 last_err=err, -693 ) -694 # Feed was never returned in self.num_retries tries. Raise the last -695 # exception encountered. -696 raise err -697 return feed -698 -699 -700class ArxivError(Exception): -701 """This package's base Exception class.""" -702 -703 url: str -704 """The feed URL that could not be fetched.""" -705 retry: int -706 """ -707 The request try number which encountered this error; 0 for the initial try, -708 1 for the first retry, and so on. -709 """ -710 message: str -711 """Message describing what caused this error.""" -712 -713 def __init__(self, url: str, retry: int, message: str): -714 """ -715 Constructs an `ArxivError` encountered while fetching the specified URL. -716 """ -717 self.url = url -718 self.retry = retry -719 self.message = message -720 super().__init__(self.message) -721 -722 def __str__(self) -> str: -723 return '{} ({})'.format(self.message, self.url) -724 -725 -726class UnexpectedEmptyPageError(ArxivError): -727 """ -728 An error raised when a page of results that should be non-empty is empty. -729 -730 This should never happen in theory, but happens sporadically due to -731 brittleness in the underlying arXiv API; usually resolved by retries. -732 -733 See `Client.results` for usage. -734 """ -735 def __init__(self, url: str, retry: int): -736 """ -737 Constructs an `UnexpectedEmptyPageError` encountered for the specified -738 API URL after `retry` tries. -739 """ -740 self.url = url -741 super().__init__(url, retry, "Page of results was unexpectedly empty") -742 -743 def __repr__(self) -> str: -744 return '{}({}, {})'.format( -745 _classname(self), -746 repr(self.url), -747 repr(self.retry) -748 ) -749 -750 -751class HTTPError(ArxivError): -752 """ -753 A non-200 status encountered while fetching a page of results. -754 -755 See `Client.results` for usage. -756 """ -757 -758 status: int -759 """The HTTP status reported by feedparser.""" -760 entry: feedparser.FeedParserDict -761 """The feed entry describing the error, if present.""" -762 -763 def __init__(self, url: str, retry: int, feed: feedparser.FeedParserDict): -764 """ -765 Constructs an `HTTPError` for the specified status code, encountered for -766 the specified API URL after `retry` tries. -767 """ -768 self.url = url -769 self.status = feed.status -770 # If the feed is valid and includes a single entry, trust it's an -771 # explanation. -772 if not feed.bozo and len(feed.entries) == 1: -773 self.entry = feed.entries[0] -774 else: -775 self.entry = None -776 super().__init__( -777 url, -778 retry, -779 "Page request resulted in HTTP {}: {}".format( -780 self.status, -781 self.entry.summary if self.entry else None, -782 ), -783 ) -784 -785 def __repr__(self) -> str: -786 return '{}({}, {}, {})'.format( -787 _classname(self), -788 repr(self.url), -789 repr(self.retry), -790 repr(self.status) -791 ) -792 -793 -794def _classname(o): -795 """A helper function for use in __repr__ methods: arxiv.Result.Link.""" -796 return 'arxiv.{}'.format(o.__class__.__qualname__) +573 If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`. +574 +575 Setting a nonzero `offset` discards leading records in the result set. +576 When `offset` is greater than or equal to `search.max_results`, the full +577 result set is discarded. +578 +579 For more on using generators, see +580 [Generators](https://wiki.python.org/moin/Generators). +581 """ +582 +583 # total_results may be reduced according to the feed's +584 # opensearch:totalResults value. +585 total_results = search.max_results +586 first_page = True +587 while offset < total_results: +588 page_size = min(self.page_size, search.max_results - offset) +589 logger.info("Requesting {} results at offset {}".format( +590 page_size, +591 offset, +592 )) +593 page_url = self._format_url(search, offset, page_size) +594 feed = self._parse_feed(page_url, first_page) +595 if first_page: +596 # NOTE: this is an ugly fix for a known bug. The totalresults +597 # value is set to 1 for results with zero entries. If that API +598 # bug is fixed, we can remove this conditional and always set +599 # `total_results = min(...)`. +600 if len(feed.entries) == 0: +601 logger.info("Got empty results; stopping generation") +602 total_results = 0 +603 else: +604 total_results = min( +605 total_results, +606 int(feed.feed.opensearch_totalresults) +607 ) +608 logger.info("Got first page; {} of {} results available".format( +609 total_results, +610 search.max_results +611 )) +612 # Subsequent pages are not the first page. +613 first_page = False +614 # Update offset for next request: account for received results. +615 offset += len(feed.entries) +616 # Yield query results until page is exhausted. +617 for entry in feed.entries: +618 try: +619 yield Result._from_feed_entry(entry) +620 except Result.MissingFieldError: +621 logger.warning("Skipping partial result") +622 continue +623 +624 def _format_url(self, search: Search, start: int, page_size: int) -> str: +625 """ +626 Construct a request API for search that returns up to `page_size` +627 results starting with the result at index `start`. +628 """ +629 url_args = search._url_args() +630 url_args.update({ +631 "start": start, +632 "max_results": page_size, +633 }) +634 return self.query_url_format.format(urlencode(url_args)) +635 +636 def _parse_feed( +637 self, +638 url: str, +639 first_page: bool = True +640 ) -> feedparser.FeedParserDict: +641 """ +642 Fetches the specified URL and parses it with feedparser. +643 +644 If a request fails or is unexpectedly empty, retries the request up to +645 `self.num_retries` times. +646 """ +647 # Invoke the recursive helper with initial available retries. +648 return self.__try_parse_feed( +649 url, +650 first_page=first_page, +651 retries_left=self.num_retries +652 ) +653 +654 def __try_parse_feed( +655 self, +656 url: str, +657 first_page: bool, +658 retries_left: int, +659 last_err: Exception = None, +660 ) -> feedparser.FeedParserDict: +661 """ +662 Recursive helper for _parse_feed. Enforces `self.delay_seconds`: if that +663 number of seconds has not passed since `_parse_feed` was last called, +664 sleeps until delay_seconds seconds have passed. +665 """ +666 retry = self.num_retries - retries_left +667 # If this call would violate the rate limit, sleep until it doesn't. +668 if self._last_request_dt is not None: +669 required = timedelta(seconds=self.delay_seconds) +670 since_last_request = datetime.now() - self._last_request_dt +671 if since_last_request < required: +672 to_sleep = (required - since_last_request).total_seconds() +673 logger.info("Sleeping for %f seconds", to_sleep) +674 time.sleep(to_sleep) +675 logger.info("Requesting page of results", extra={ +676 'url': url, +677 'first_page': first_page, +678 'retry': retry, +679 'last_err': last_err.message if last_err is not None else None, +680 }) +681 feed = feedparser.parse(url) +682 self._last_request_dt = datetime.now() +683 err = None +684 if feed.status != 200: +685 err = HTTPError(url, retry, feed) +686 elif len(feed.entries) == 0 and not first_page: +687 err = UnexpectedEmptyPageError(url, retry) +688 if err is not None: +689 if retries_left > 0: +690 return self.__try_parse_feed( +691 url, +692 first_page=first_page, +693 retries_left=retries_left-1, +694 last_err=err, +695 ) +696 # Feed was never returned in self.num_retries tries. Raise the last +697 # exception encountered. +698 raise err +699 return feed +700 +701 +702class ArxivError(Exception): +703 """This package's base Exception class.""" +704 +705 url: str +706 """The feed URL that could not be fetched.""" +707 retry: int +708 """ +709 The request try number which encountered this error; 0 for the initial try, +710 1 for the first retry, and so on. +711 """ +712 message: str +713 """Message describing what caused this error.""" +714 +715 def __init__(self, url: str, retry: int, message: str): +716 """ +717 Constructs an `ArxivError` encountered while fetching the specified URL. +718 """ +719 self.url = url +720 self.retry = retry +721 self.message = message +722 super().__init__(self.message) +723 +724 def __str__(self) -> str: +725 return '{} ({})'.format(self.message, self.url) +726 +727 +728class UnexpectedEmptyPageError(ArxivError): +729 """ +730 An error raised when a page of results that should be non-empty is empty. +731 +732 This should never happen in theory, but happens sporadically due to +733 brittleness in the underlying arXiv API; usually resolved by retries. +734 +735 See `Client.results` for usage. +736 """ +737 def __init__(self, url: str, retry: int): +738 """ +739 Constructs an `UnexpectedEmptyPageError` encountered for the specified +740 API URL after `retry` tries. +741 """ +742 self.url = url +743 super().__init__(url, retry, "Page of results was unexpectedly empty") +744 +745 def __repr__(self) -> str: +746 return '{}({}, {})'.format( +747 _classname(self), +748 repr(self.url), +749 repr(self.retry) +750 ) +751 +752 +753class HTTPError(ArxivError): +754 """ +755 A non-200 status encountered while fetching a page of results. +756 +757 See `Client.results` for usage. +758 """ +759 +760 status: int +761 """The HTTP status reported by feedparser.""" +762 entry: feedparser.FeedParserDict +763 """The feed entry describing the error, if present.""" +764 +765 def __init__(self, url: str, retry: int, feed: feedparser.FeedParserDict): +766 """ +767 Constructs an `HTTPError` for the specified status code, encountered for +768 the specified API URL after `retry` tries. +769 """ +770 self.url = url +771 self.status = feed.status +772 # If the feed is valid and includes a single entry, trust it's an +773 # explanation. +774 if not feed.bozo and len(feed.entries) == 1: +775 self.entry = feed.entries[0] +776 else: +777 self.entry = None +778 super().__init__( +779 url, +780 retry, +781 "Page request resulted in HTTP {}: {}".format( +782 self.status, +783 self.entry.summary if self.entry else None, +784 ), +785 ) +786 +787 def __repr__(self) -> str: +788 return '{}({}, {}, {})'.format( +789 _classname(self), +790 repr(self.url), +791 repr(self.retry), +792 repr(self.status) +793 ) +794 +795 +796def _classname(o): +797 """A helper function for use in __repr__ methods: arxiv.Result.Link.""" +798 return 'arxiv.{}'.format(o.__class__.__qualname__)
23class Result(object): - 24 """ - 25 An entry in an arXiv query results feed. - 26 - 27 See [the arXiv API User's Manual: Details of Atom Results - 28 Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned). - 29 """ - 30 - 31 entry_id: str - 32 """A url of the form `http://arxiv.org/abs/{id}`.""" - 33 updated: datetime - 34 """When the result was last updated.""" - 35 published: datetime - 36 """When the result was originally published.""" - 37 title: str - 38 """The title of the result.""" - 39 authors: list - 40 """The result's authors.""" - 41 summary: str - 42 """The result abstract.""" - 43 comment: str - 44 """The authors' comment if present.""" - 45 journal_ref: str - 46 """A journal reference if present.""" - 47 doi: str - 48 """A URL for the resolved DOI to an external resource if present.""" - 49 primary_category: str - 50 """ - 51 The result's primary arXiv category. See [arXiv: Category - 52 Taxonomy](https://arxiv.org/category_taxonomy). - 53 """ - 54 categories: List[str] - 55 """ - 56 All of the result's categories. See [arXiv: Category - 57 Taxonomy](https://arxiv.org/category_taxonomy). - 58 """ - 59 links: list - 60 """Up to three URLs associated with this result.""" - 61 pdf_url: str - 62 """The URL of a PDF version of this result if present among links.""" - 63 _raw: feedparser.FeedParserDict - 64 """ - 65 The raw feedparser result object if this Result was constructed with - 66 Result._from_feed_entry. - 67 """ - 68 - 69 def __init__( - 70 self, - 71 entry_id: str, - 72 updated: datetime = _DEFAULT_TIME, - 73 published: datetime = _DEFAULT_TIME, - 74 title: str = "", - 75 authors: List['Result.Author'] = [], - 76 summary: str = "", - 77 comment: str = "", - 78 journal_ref: str = "", - 79 doi: str = "", - 80 primary_category: str = "", - 81 categories: List[str] = [], - 82 links: List['Result.Link'] = [], - 83 _raw: feedparser.FeedParserDict = None, - 84 ): - 85 """ - 86 Constructs an arXiv search result item. - 87 - 88 In most cases, prefer using `Result._from_feed_entry` to parsing and - 89 constructing `Result`s yourself. - 90 """ - 91 self.entry_id = entry_id - 92 self.updated = updated - 93 self.published = published - 94 self.title = title - 95 self.authors = authors - 96 self.summary = summary - 97 self.comment = comment - 98 self.journal_ref = journal_ref - 99 self.doi = doi -100 self.primary_category = primary_category -101 self.categories = categories -102 self.links = links -103 # Calculated members -104 self.pdf_url = Result._get_pdf_url(links) -105 # Debugging -106 self._raw = _raw -107 -108 def _from_feed_entry(entry: feedparser.FeedParserDict) -> 'Result': -109 """ -110 Converts a feedparser entry for an arXiv search result feed into a -111 Result object. -112 """ -113 if not hasattr(entry, "id"): -114 raise Result.MissingFieldError("id") -115 # Title attribute may be absent for certain titles. Defaulting to "0" as -116 # it's the only title observed to cause this bug. -117 # https://github.com/lukasschwab/arxiv.py/issues/71 -118 # title = entry.title if hasattr(entry, "title") else "0" -119 title = "0" -120 if hasattr(entry, "title"): -121 title = entry.title -122 else: -123 logger.warning( -124 "Result %s is missing title attribute; defaulting to '0'", -125 entry.id -126 ) -127 return Result( -128 entry_id=entry.id, -129 updated=Result._to_datetime(entry.updated_parsed), -130 published=Result._to_datetime(entry.published_parsed), -131 title=re.sub(r'\s+', ' ', title), -132 authors=[Result.Author._from_feed_author(a) for a in entry.authors], -133 summary=entry.summary, -134 comment=entry.get('arxiv_comment'), -135 journal_ref=entry.get('arxiv_journal_ref'), -136 doi=entry.get('arxiv_doi'), -137 primary_category=entry.arxiv_primary_category.get('term'), -138 categories=[tag.get('term') for tag in entry.tags], -139 links=[Result.Link._from_feed_link(link) for link in entry.links], -140 _raw=entry -141 ) -142 -143 def __str__(self) -> str: -144 return self.entry_id -145 -146 def __repr__(self) -> str: -147 return ( -148 '{}(entry_id={}, updated={}, published={}, title={}, authors={}, ' -149 'summary={}, comment={}, journal_ref={}, doi={}, ' -150 'primary_category={}, categories={}, links={})' -151 ).format( -152 _classname(self), -153 repr(self.entry_id), -154 repr(self.updated), -155 repr(self.published), -156 repr(self.title), -157 repr(self.authors), -158 repr(self.summary), -159 repr(self.comment), -160 repr(self.journal_ref), -161 repr(self.doi), -162 repr(self.primary_category), -163 repr(self.categories), -164 repr(self.links) -165 ) -166 -167 def __eq__(self, other) -> bool: -168 if isinstance(other, Result): -169 return self.entry_id == other.entry_id -170 return False -171 -172 def get_short_id(self) -> str: -173 """ -174 Returns the short ID for this result. -175 -176 + If the result URL is `"http://arxiv.org/abs/2107.05580v1"`, -177 `result.get_short_id()` returns `2107.05580v1`. -178 -179 + If the result URL is `"http://arxiv.org/abs/quant-ph/0201082v1"`, -180 `result.get_short_id()` returns `"quant-ph/0201082v1"` (the pre-March -181 2007 arXiv identifier format). -182 -183 For an explanation of the difference between arXiv's legacy and current -184 identifiers, see [Understanding the arXiv -185 identifier](https://arxiv.org/help/arxiv_identifier). -186 """ -187 return self.entry_id.split('arxiv.org/abs/')[-1] -188 -189 def _get_default_filename(self, extension: str = "pdf") -> str: -190 """ -191 A default `to_filename` function for the extension given. -192 """ -193 nonempty_title = self.title if self.title else "UNTITLED" -194 # Remove disallowed characters. -195 clean_title = '_'.join(re.findall(r'\w+', nonempty_title)) -196 return "{}.{}.{}".format(self.get_short_id(), clean_title, extension) -197 -198 def download_pdf(self, dirpath: str = './', filename: str = '') -> str: -199 """ -200 Downloads the PDF for this result to the specified directory. -201 -202 The filename is generated by calling `to_filename(self)`. -203 """ -204 if not filename: -205 filename = self._get_default_filename() -206 path = os.path.join(dirpath, filename) -207 written_path, _ = urlretrieve(self.pdf_url, path) -208 return written_path -209 -210 def download_source(self, dirpath: str = './', filename: str = '') -> str: -211 """ -212 Downloads the source tarfile for this result to the specified -213 directory. -214 -215 The filename is generated by calling `to_filename(self)`. -216 """ -217 if not filename: -218 filename = self._get_default_filename('tar.gz') -219 path = os.path.join(dirpath, filename) -220 # Bodge: construct the source URL from the PDF URL. -221 source_url = self.pdf_url.replace('/pdf/', '/src/') -222 written_path, _ = urlretrieve(source_url, path) -223 return written_path -224 -225 def _get_pdf_url(links: list) -> str: -226 """ -227 Finds the PDF link among a result's links and returns its URL. -228 -229 Should only be called once for a given `Result`, in its constructor. -230 After construction, the URL should be available in `Result.pdf_url`. -231 """ -232 pdf_urls = [link.href for link in links if link.title == 'pdf'] -233 if len(pdf_urls) == 0: -234 return None -235 elif len(pdf_urls) > 1: -236 logger.warning( -237 "Result has multiple PDF links; using %s", -238 pdf_urls[0] -239 ) -240 return pdf_urls[0] -241 -242 def _to_datetime(ts: time.struct_time) -> datetime: -243 """ -244 Converts a UTC time.struct_time into a time-zone-aware datetime. -245 -246 This will be replaced with feedparser functionality [when it becomes -247 available](https://github.com/kurtmckee/feedparser/issues/212). -248 """ -249 return datetime.fromtimestamp(timegm(ts), tz=timezone.utc) -250 -251 class Author(object): -252 """ -253 A light inner class for representing a result's authors. -254 """ -255 -256 name: str -257 """The author's name.""" -258 -259 def __init__(self, name: str): -260 """ -261 Constructs an `Author` with the specified name. -262 -263 In most cases, prefer using `Author._from_feed_author` to parsing -264 and constructing `Author`s yourself. -265 """ -266 self.name = name -267 -268 def _from_feed_author( -269 feed_author: feedparser.FeedParserDict -270 ) -> 'Result.Author': -271 """ -272 Constructs an `Author` with the name specified in an author object -273 from a feed entry. -274 -275 See usage in `Result._from_feed_entry`. -276 """ -277 return Result.Author(feed_author.name) -278 -279 def __str__(self) -> str: -280 return self.name -281 -282 def __repr__(self) -> str: -283 return '{}({})'.format(_classname(self), repr(self.name)) -284 -285 def __eq__(self, other) -> bool: -286 if isinstance(other, Result.Author): -287 return self.name == other.name -288 return False -289 -290 class Link(object): -291 """ -292 A light inner class for representing a result's links. -293 """ -294 -295 href: str -296 """The link's `href` attribute.""" -297 title: str -298 """The link's title.""" -299 rel: str -300 """The link's relationship to the `Result`.""" -301 content_type: str -302 """The link's HTTP content type.""" -303 -304 def __init__( -305 self, -306 href: str, -307 title: str = None, -308 rel: str = None, -309 content_type: str = None -310 ): -311 """ -312 Constructs a `Link` with the specified link metadata. -313 -314 In most cases, prefer using `Link._from_feed_link` to parsing and -315 constructing `Link`s yourself. -316 """ -317 self.href = href -318 self.title = title -319 self.rel = rel -320 self.content_type = content_type -321 -322 def _from_feed_link( -323 feed_link: feedparser.FeedParserDict -324 ) -> 'Result.Link': -325 """ -326 Constructs a `Link` with link metadata specified in a link object -327 from a feed entry. -328 -329 See usage in `Result._from_feed_entry`. -330 """ -331 return Result.Link( -332 href=feed_link.href, -333 title=feed_link.get('title'), -334 rel=feed_link.get('rel'), -335 content_type=feed_link.get('content_type') -336 ) -337 -338 def __str__(self) -> str: -339 return self.href -340 -341 def __repr__(self) -> str: -342 return '{}({}, title={}, rel={}, content_type={})'.format( -343 _classname(self), -344 repr(self.href), -345 repr(self.title), -346 repr(self.rel), -347 repr(self.content_type) -348 ) -349 -350 def __eq__(self, other) -> bool: -351 if isinstance(other, Result.Link): -352 return self.href == other.href -353 return False -354 -355 class MissingFieldError(Exception): -356 """ -357 An error indicating an entry is unparseable because it lacks required -358 fields. -359 """ -360 -361 missing_field: str -362 """The required field missing from the would-be entry.""" -363 message: str -364 """Message describing what caused this error.""" -365 -366 def __init__(self, missing_field): -367 self.missing_field = missing_field -368 self.message = "Entry from arXiv missing required info" -369 -370 def __repr__(self) -> str: -371 return '{}({})'.format( -372 _classname(self), -373 repr(self.missing_field) -374 ) +@@ -1673,44 +1675,44 @@25class Result(object): + 26 """ + 27 An entry in an arXiv query results feed. + 28 + 29 See [the arXiv API User's Manual: Details of Atom Results + 30 Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned). + 31 """ + 32 + 33 entry_id: str + 34 """A url of the form `http://arxiv.org/abs/{id}`.""" + 35 updated: datetime + 36 """When the result was last updated.""" + 37 published: datetime + 38 """When the result was originally published.""" + 39 title: str + 40 """The title of the result.""" + 41 authors: List[Author] + 42 """The result's authors.""" + 43 summary: str + 44 """The result abstract.""" + 45 comment: str + 46 """The authors' comment if present.""" + 47 journal_ref: str + 48 """A journal reference if present.""" + 49 doi: str + 50 """A URL for the resolved DOI to an external resource if present.""" + 51 primary_category: str + 52 """ + 53 The result's primary arXiv category. See [arXiv: Category + 54 Taxonomy](https://arxiv.org/category_taxonomy). + 55 """ + 56 categories: List[str] + 57 """ + 58 All of the result's categories. See [arXiv: Category + 59 Taxonomy](https://arxiv.org/category_taxonomy). + 60 """ + 61 links: List[Link] + 62 """Up to three URLs associated with this result.""" + 63 pdf_url: str + 64 """The URL of a PDF version of this result if present among links.""" + 65 _raw: feedparser.FeedParserDict + 66 """ + 67 The raw feedparser result object if this Result was constructed with + 68 Result._from_feed_entry. + 69 """ + 70 + 71 def __init__( + 72 self, + 73 entry_id: str, + 74 updated: datetime = _DEFAULT_TIME, + 75 published: datetime = _DEFAULT_TIME, + 76 title: str = "", + 77 authors: List[Author] = [], + 78 summary: str = "", + 79 comment: str = "", + 80 journal_ref: str = "", + 81 doi: str = "", + 82 primary_category: str = "", + 83 categories: List[str] = [], + 84 links: List[Link] = [], + 85 _raw: feedparser.FeedParserDict = None, + 86 ): + 87 """ + 88 Constructs an arXiv search result item. + 89 + 90 In most cases, prefer using `Result._from_feed_entry` to parsing and + 91 constructing `Result`s yourself. + 92 """ + 93 self.entry_id = entry_id + 94 self.updated = updated + 95 self.published = published + 96 self.title = title + 97 self.authors = authors + 98 self.summary = summary + 99 self.comment = comment +100 self.journal_ref = journal_ref +101 self.doi = doi +102 self.primary_category = primary_category +103 self.categories = categories +104 self.links = links +105 # Calculated members +106 self.pdf_url = Result._get_pdf_url(links) +107 # Debugging +108 self._raw = _raw +109 +110 def _from_feed_entry(entry: feedparser.FeedParserDict) -> Result: +111 """ +112 Converts a feedparser entry for an arXiv search result feed into a +113 Result object. +114 """ +115 if not hasattr(entry, "id"): +116 raise Result.MissingFieldError("id") +117 # Title attribute may be absent for certain titles. Defaulting to "0" as +118 # it's the only title observed to cause this bug. +119 # https://github.com/lukasschwab/arxiv.py/issues/71 +120 # title = entry.title if hasattr(entry, "title") else "0" +121 title = "0" +122 if hasattr(entry, "title"): +123 title = entry.title +124 else: +125 logger.warning( +126 "Result %s is missing title attribute; defaulting to '0'", +127 entry.id +128 ) +129 return Result( +130 entry_id=entry.id, +131 updated=Result._to_datetime(entry.updated_parsed), +132 published=Result._to_datetime(entry.published_parsed), +133 title=re.sub(r'\s+', ' ', title), +134 authors=[Result.Author._from_feed_author(a) for a in entry.authors], +135 summary=entry.summary, +136 comment=entry.get('arxiv_comment'), +137 journal_ref=entry.get('arxiv_journal_ref'), +138 doi=entry.get('arxiv_doi'), +139 primary_category=entry.arxiv_primary_category.get('term'), +140 categories=[tag.get('term') for tag in entry.tags], +141 links=[Result.Link._from_feed_link(link) for link in entry.links], +142 _raw=entry +143 ) +144 +145 def __str__(self) -> str: +146 return self.entry_id +147 +148 def __repr__(self) -> str: +149 return ( +150 '{}(entry_id={}, updated={}, published={}, title={}, authors={}, ' +151 'summary={}, comment={}, journal_ref={}, doi={}, ' +152 'primary_category={}, categories={}, links={})' +153 ).format( +154 _classname(self), +155 repr(self.entry_id), +156 repr(self.updated), +157 repr(self.published), +158 repr(self.title), +159 repr(self.authors), +160 repr(self.summary), +161 repr(self.comment), +162 repr(self.journal_ref), +163 repr(self.doi), +164 repr(self.primary_category), +165 repr(self.categories), +166 repr(self.links) +167 ) +168 +169 def __eq__(self, other) -> bool: +170 if isinstance(other, Result): +171 return self.entry_id == other.entry_id +172 return False +173 +174 def get_short_id(self) -> str: +175 """ +176 Returns the short ID for this result. +177 +178 + If the result URL is `"http://arxiv.org/abs/2107.05580v1"`, +179 `result.get_short_id()` returns `2107.05580v1`. +180 +181 + If the result URL is `"http://arxiv.org/abs/quant-ph/0201082v1"`, +182 `result.get_short_id()` returns `"quant-ph/0201082v1"` (the pre-March +183 2007 arXiv identifier format). +184 +185 For an explanation of the difference between arXiv's legacy and current +186 identifiers, see [Understanding the arXiv +187 identifier](https://arxiv.org/help/arxiv_identifier). +188 """ +189 return self.entry_id.split('arxiv.org/abs/')[-1] +190 +191 def _get_default_filename(self, extension: str = "pdf") -> str: +192 """ +193 A default `to_filename` function for the extension given. +194 """ +195 nonempty_title = self.title if self.title else "UNTITLED" +196 # Remove disallowed characters. +197 clean_title = '_'.join(re.findall(r'\w+', nonempty_title)) +198 return "{}.{}.{}".format(self.get_short_id(), clean_title, extension) +199 +200 def download_pdf(self, dirpath: str = './', filename: str = '') -> str: +201 """ +202 Downloads the PDF for this result to the specified directory. +203 +204 The filename is generated by calling `to_filename(self)`. +205 """ +206 if not filename: +207 filename = self._get_default_filename() +208 path = os.path.join(dirpath, filename) +209 written_path, _ = urlretrieve(self.pdf_url, path) +210 return written_path +211 +212 def download_source(self, dirpath: str = './', filename: str = '') -> str: +213 """ +214 Downloads the source tarfile for this result to the specified +215 directory. +216 +217 The filename is generated by calling `to_filename(self)`. +218 """ +219 if not filename: +220 filename = self._get_default_filename('tar.gz') +221 path = os.path.join(dirpath, filename) +222 # Bodge: construct the source URL from the PDF URL. +223 source_url = self.pdf_url.replace('/pdf/', '/src/') +224 written_path, _ = urlretrieve(source_url, path) +225 return written_path +226 +227 def _get_pdf_url(links: List[Link]) -> str: +228 """ +229 Finds the PDF link among a result's links and returns its URL. +230 +231 Should only be called once for a given `Result`, in its constructor. +232 After construction, the URL should be available in `Result.pdf_url`. +233 """ +234 pdf_urls = [link.href for link in links if link.title == 'pdf'] +235 if len(pdf_urls) == 0: +236 return None +237 elif len(pdf_urls) > 1: +238 logger.warning( +239 "Result has multiple PDF links; using %s", +240 pdf_urls[0] +241 ) +242 return pdf_urls[0] +243 +244 def _to_datetime(ts: time.struct_time) -> datetime: +245 """ +246 Converts a UTC time.struct_time into a time-zone-aware datetime. +247 +248 This will be replaced with feedparser functionality [when it becomes +249 available](https://github.com/kurtmckee/feedparser/issues/212). +250 """ +251 return datetime.fromtimestamp(timegm(ts), tz=timezone.utc) +252 +253 class Author(object): +254 """ +255 A light inner class for representing a result's authors. +256 """ +257 +258 name: str +259 """The author's name.""" +260 +261 def __init__(self, name: str): +262 """ +263 Constructs an `Author` with the specified name. +264 +265 In most cases, prefer using `Author._from_feed_author` to parsing +266 and constructing `Author`s yourself. +267 """ +268 self.name = name +269 +270 def _from_feed_author( +271 feed_author: feedparser.FeedParserDict +272 ) -> Result.Author: +273 """ +274 Constructs an `Author` with the name specified in an author object +275 from a feed entry. +276 +277 See usage in `Result._from_feed_entry`. +278 """ +279 return Result.Author(feed_author.name) +280 +281 def __str__(self) -> str: +282 return self.name +283 +284 def __repr__(self) -> str: +285 return '{}({})'.format(_classname(self), repr(self.name)) +286 +287 def __eq__(self, other) -> bool: +288 if isinstance(other, Result.Author): +289 return self.name == other.name +290 return False +291 +292 class Link(object): +293 """ +294 A light inner class for representing a result's links. +295 """ +296 +297 href: str +298 """The link's `href` attribute.""" +299 title: str +300 """The link's title.""" +301 rel: str +302 """The link's relationship to the `Result`.""" +303 content_type: str +304 """The link's HTTP content type.""" +305 +306 def __init__( +307 self, +308 href: str, +309 title: str = None, +310 rel: str = None, +311 content_type: str = None +312 ): +313 """ +314 Constructs a `Link` with the specified link metadata. +315 +316 In most cases, prefer using `Link._from_feed_link` to parsing and +317 constructing `Link`s yourself. +318 """ +319 self.href = href +320 self.title = title +321 self.rel = rel +322 self.content_type = content_type +323 +324 def _from_feed_link( +325 feed_link: feedparser.FeedParserDict +326 ) -> Result.Link: +327 """ +328 Constructs a `Link` with link metadata specified in a link object +329 from a feed entry. +330 +331 See usage in `Result._from_feed_entry`. +332 """ +333 return Result.Link( +334 href=feed_link.href, +335 title=feed_link.get('title'), +336 rel=feed_link.get('rel'), +337 content_type=feed_link.get('content_type') +338 ) +339 +340 def __str__(self) -> str: +341 return self.href +342 +343 def __repr__(self) -> str: +344 return '{}({}, title={}, rel={}, content_type={})'.format( +345 _classname(self), +346 repr(self.href), +347 repr(self.title), +348 repr(self.rel), +349 repr(self.content_type) +350 ) +351 +352 def __eq__(self, other) -> bool: +353 if isinstance(other, Result.Link): +354 return self.href == other.href +355 return False +356 +357 class MissingFieldError(Exception): +358 """ +359 An error indicating an entry is unparseable because it lacks required +360 fields. +361 """ +362 +363 missing_field: str +364 """The required field missing from the would-be entry.""" +365 message: str +366 """Message describing what caused this error.""" +367 +368 def __init__(self, missing_field): +369 self.missing_field = missing_field +370 self.message = "Entry from arXiv missing required info" +371 +372 def __repr__(self) -> str: +373 return '{}({})'.format( +374 _classname(self), +375 repr(self.missing_field) +376 )Example: logging
69 def __init__( - 70 self, - 71 entry_id: str, - 72 updated: datetime = _DEFAULT_TIME, - 73 published: datetime = _DEFAULT_TIME, - 74 title: str = "", - 75 authors: List['Result.Author'] = [], - 76 summary: str = "", - 77 comment: str = "", - 78 journal_ref: str = "", - 79 doi: str = "", - 80 primary_category: str = "", - 81 categories: List[str] = [], - 82 links: List['Result.Link'] = [], - 83 _raw: feedparser.FeedParserDict = None, - 84 ): - 85 """ - 86 Constructs an arXiv search result item. - 87 - 88 In most cases, prefer using `Result._from_feed_entry` to parsing and - 89 constructing `Result`s yourself. - 90 """ - 91 self.entry_id = entry_id - 92 self.updated = updated - 93 self.published = published - 94 self.title = title - 95 self.authors = authors - 96 self.summary = summary - 97 self.comment = comment - 98 self.journal_ref = journal_ref - 99 self.doi = doi -100 self.primary_category = primary_category -101 self.categories = categories -102 self.links = links -103 # Calculated members -104 self.pdf_url = Result._get_pdf_url(links) -105 # Debugging -106 self._raw = _raw +@@ -1776,7 +1778,7 @@71 def __init__( + 72 self, + 73 entry_id: str, + 74 updated: datetime = _DEFAULT_TIME, + 75 published: datetime = _DEFAULT_TIME, + 76 title: str = "", + 77 authors: List[Author] = [], + 78 summary: str = "", + 79 comment: str = "", + 80 journal_ref: str = "", + 81 doi: str = "", + 82 primary_category: str = "", + 83 categories: List[str] = [], + 84 links: List[Link] = [], + 85 _raw: feedparser.FeedParserDict = None, + 86 ): + 87 """ + 88 Constructs an arXiv search result item. + 89 + 90 In most cases, prefer using `Result._from_feed_entry` to parsing and + 91 constructing `Result`s yourself. + 92 """ + 93 self.entry_id = entry_id + 94 self.updated = updated + 95 self.published = published + 96 self.title = title + 97 self.authors = authors + 98 self.summary = summary + 99 self.comment = comment +100 self.journal_ref = journal_ref +101 self.doi = doi +102 self.primary_category = primary_category +103 self.categories = categories +104 self.links = links +105 # Calculated members +106 self.pdf_url = Result._get_pdf_url(links) +107 # Debugging +108 self._raw = _rawExample: logging
172 def get_short_id(self) -> str: -173 """ -174 Returns the short ID for this result. -175 -176 + If the result URL is `"http://arxiv.org/abs/2107.05580v1"`, -177 `result.get_short_id()` returns `2107.05580v1`. -178 -179 + If the result URL is `"http://arxiv.org/abs/quant-ph/0201082v1"`, -180 `result.get_short_id()` returns `"quant-ph/0201082v1"` (the pre-March -181 2007 arXiv identifier format). -182 -183 For an explanation of the difference between arXiv's legacy and current -184 identifiers, see [Understanding the arXiv -185 identifier](https://arxiv.org/help/arxiv_identifier). -186 """ -187 return self.entry_id.split('arxiv.org/abs/')[-1] +@@ -1951,17 +1953,17 @@174 def get_short_id(self) -> str: +175 """ +176 Returns the short ID for this result. +177 +178 + If the result URL is `"http://arxiv.org/abs/2107.05580v1"`, +179 `result.get_short_id()` returns `2107.05580v1`. +180 +181 + If the result URL is `"http://arxiv.org/abs/quant-ph/0201082v1"`, +182 `result.get_short_id()` returns `"quant-ph/0201082v1"` (the pre-March +183 2007 arXiv identifier format). +184 +185 For an explanation of the difference between arXiv's legacy and current +186 identifiers, see [Understanding the arXiv +187 identifier](https://arxiv.org/help/arxiv_identifier). +188 """ +189 return self.entry_id.split('arxiv.org/abs/')[-1]Example: logging
198 def download_pdf(self, dirpath: str = './', filename: str = '') -> str: -199 """ -200 Downloads the PDF for this result to the specified directory. -201 -202 The filename is generated by calling `to_filename(self)`. -203 """ -204 if not filename: -205 filename = self._get_default_filename() -206 path = os.path.join(dirpath, filename) -207 written_path, _ = urlretrieve(self.pdf_url, path) -208 return written_path +@@ -1983,20 +1985,20 @@200 def download_pdf(self, dirpath: str = './', filename: str = '') -> str: +201 """ +202 Downloads the PDF for this result to the specified directory. +203 +204 The filename is generated by calling `to_filename(self)`. +205 """ +206 if not filename: +207 filename = self._get_default_filename() +208 path = os.path.join(dirpath, filename) +209 written_path, _ = urlretrieve(self.pdf_url, path) +210 return written_pathExample: logging
210 def download_source(self, dirpath: str = './', filename: str = '') -> str: -211 """ -212 Downloads the source tarfile for this result to the specified -213 directory. -214 -215 The filename is generated by calling `to_filename(self)`. -216 """ -217 if not filename: -218 filename = self._get_default_filename('tar.gz') -219 path = os.path.join(dirpath, filename) -220 # Bodge: construct the source URL from the PDF URL. -221 source_url = self.pdf_url.replace('/pdf/', '/src/') -222 written_path, _ = urlretrieve(source_url, path) -223 return written_path +@@ -2020,44 +2022,44 @@212 def download_source(self, dirpath: str = './', filename: str = '') -> str: +213 """ +214 Downloads the source tarfile for this result to the specified +215 directory. +216 +217 The filename is generated by calling `to_filename(self)`. +218 """ +219 if not filename: +220 filename = self._get_default_filename('tar.gz') +221 path = os.path.join(dirpath, filename) +222 # Bodge: construct the source URL from the PDF URL. +223 source_url = self.pdf_url.replace('/pdf/', '/src/') +224 written_path, _ = urlretrieve(source_url, path) +225 return written_pathExample: logging
251 class Author(object): -252 """ -253 A light inner class for representing a result's authors. -254 """ -255 -256 name: str -257 """The author's name.""" -258 -259 def __init__(self, name: str): -260 """ -261 Constructs an `Author` with the specified name. -262 -263 In most cases, prefer using `Author._from_feed_author` to parsing -264 and constructing `Author`s yourself. -265 """ -266 self.name = name -267 -268 def _from_feed_author( -269 feed_author: feedparser.FeedParserDict -270 ) -> 'Result.Author': -271 """ -272 Constructs an `Author` with the name specified in an author object -273 from a feed entry. -274 -275 See usage in `Result._from_feed_entry`. -276 """ -277 return Result.Author(feed_author.name) -278 -279 def __str__(self) -> str: -280 return self.name -281 -282 def __repr__(self) -> str: -283 return '{}({})'.format(_classname(self), repr(self.name)) -284 -285 def __eq__(self, other) -> bool: -286 if isinstance(other, Result.Author): -287 return self.name == other.name -288 return False +@@ -2075,14 +2077,14 @@253 class Author(object): +254 """ +255 A light inner class for representing a result's authors. +256 """ +257 +258 name: str +259 """The author's name.""" +260 +261 def __init__(self, name: str): +262 """ +263 Constructs an `Author` with the specified name. +264 +265 In most cases, prefer using `Author._from_feed_author` to parsing +266 and constructing `Author`s yourself. +267 """ +268 self.name = name +269 +270 def _from_feed_author( +271 feed_author: feedparser.FeedParserDict +272 ) -> Result.Author: +273 """ +274 Constructs an `Author` with the name specified in an author object +275 from a feed entry. +276 +277 See usage in `Result._from_feed_entry`. +278 """ +279 return Result.Author(feed_author.name) +280 +281 def __str__(self) -> str: +282 return self.name +283 +284 def __repr__(self) -> str: +285 return '{}({})'.format(_classname(self), repr(self.name)) +286 +287 def __eq__(self, other) -> bool: +288 if isinstance(other, Result.Author): +289 return self.name == other.name +290 return FalseExample: logging
259 def __init__(self, name: str): -260 """ -261 Constructs an `Author` with the specified name. -262 -263 In most cases, prefer using `Author._from_feed_author` to parsing -264 and constructing `Author`s yourself. -265 """ -266 self.name = name +@@ -2119,70 +2121,70 @@261 def __init__(self, name: str): +262 """ +263 Constructs an `Author` with the specified name. +264 +265 In most cases, prefer using `Author._from_feed_author` to parsing +266 and constructing `Author`s yourself. +267 """ +268 self.name = nameExample: logging
290 class Link(object): -291 """ -292 A light inner class for representing a result's links. -293 """ -294 -295 href: str -296 """The link's `href` attribute.""" -297 title: str -298 """The link's title.""" -299 rel: str -300 """The link's relationship to the `Result`.""" -301 content_type: str -302 """The link's HTTP content type.""" -303 -304 def __init__( -305 self, -306 href: str, -307 title: str = None, -308 rel: str = None, -309 content_type: str = None -310 ): -311 """ -312 Constructs a `Link` with the specified link metadata. -313 -314 In most cases, prefer using `Link._from_feed_link` to parsing and -315 constructing `Link`s yourself. -316 """ -317 self.href = href -318 self.title = title -319 self.rel = rel -320 self.content_type = content_type -321 -322 def _from_feed_link( -323 feed_link: feedparser.FeedParserDict -324 ) -> 'Result.Link': -325 """ -326 Constructs a `Link` with link metadata specified in a link object -327 from a feed entry. -328 -329 See usage in `Result._from_feed_entry`. -330 """ -331 return Result.Link( -332 href=feed_link.href, -333 title=feed_link.get('title'), -334 rel=feed_link.get('rel'), -335 content_type=feed_link.get('content_type') -336 ) -337 -338 def __str__(self) -> str: -339 return self.href -340 -341 def __repr__(self) -> str: -342 return '{}({}, title={}, rel={}, content_type={})'.format( -343 _classname(self), -344 repr(self.href), -345 repr(self.title), -346 repr(self.rel), -347 repr(self.content_type) -348 ) -349 -350 def __eq__(self, other) -> bool: -351 if isinstance(other, Result.Link): -352 return self.href == other.href -353 return False +@@ -2200,23 +2202,23 @@292 class Link(object): +293 """ +294 A light inner class for representing a result's links. +295 """ +296 +297 href: str +298 """The link's `href` attribute.""" +299 title: str +300 """The link's title.""" +301 rel: str +302 """The link's relationship to the `Result`.""" +303 content_type: str +304 """The link's HTTP content type.""" +305 +306 def __init__( +307 self, +308 href: str, +309 title: str = None, +310 rel: str = None, +311 content_type: str = None +312 ): +313 """ +314 Constructs a `Link` with the specified link metadata. +315 +316 In most cases, prefer using `Link._from_feed_link` to parsing and +317 constructing `Link`s yourself. +318 """ +319 self.href = href +320 self.title = title +321 self.rel = rel +322 self.content_type = content_type +323 +324 def _from_feed_link( +325 feed_link: feedparser.FeedParserDict +326 ) -> Result.Link: +327 """ +328 Constructs a `Link` with link metadata specified in a link object +329 from a feed entry. +330 +331 See usage in `Result._from_feed_entry`. +332 """ +333 return Result.Link( +334 href=feed_link.href, +335 title=feed_link.get('title'), +336 rel=feed_link.get('rel'), +337 content_type=feed_link.get('content_type') +338 ) +339 +340 def __str__(self) -> str: +341 return self.href +342 +343 def __repr__(self) -> str: +344 return '{}({}, title={}, rel={}, content_type={})'.format( +345 _classname(self), +346 repr(self.href), +347 repr(self.title), +348 repr(self.rel), +349 repr(self.content_type) +350 ) +351 +352 def __eq__(self, other) -> bool: +353 if isinstance(other, Result.Link): +354 return self.href == other.href +355 return FalseExample: logging
304 def __init__( -305 self, -306 href: str, -307 title: str = None, -308 rel: str = None, -309 content_type: str = None -310 ): -311 """ -312 Constructs a `Link` with the specified link metadata. -313 -314 In most cases, prefer using `Link._from_feed_link` to parsing and -315 constructing `Link`s yourself. -316 """ -317 self.href = href -318 self.title = title -319 self.rel = rel -320 self.content_type = content_type +@@ -2292,26 +2294,26 @@306 def __init__( +307 self, +308 href: str, +309 title: str = None, +310 rel: str = None, +311 content_type: str = None +312 ): +313 """ +314 Constructs a `Link` with the specified link metadata. +315 +316 In most cases, prefer using `Link._from_feed_link` to parsing and +317 constructing `Link`s yourself. +318 """ +319 self.href = href +320 self.title = title +321 self.rel = rel +322 self.content_type = content_typeExample: logging
355 class MissingFieldError(Exception): -356 """ -357 An error indicating an entry is unparseable because it lacks required -358 fields. -359 """ -360 -361 missing_field: str -362 """The required field missing from the would-be entry.""" -363 message: str -364 """Message describing what caused this error.""" -365 -366 def __init__(self, missing_field): -367 self.missing_field = missing_field -368 self.message = "Entry from arXiv missing required info" -369 -370 def __repr__(self) -> str: -371 return '{}({})'.format( -372 _classname(self), -373 repr(self.missing_field) -374 ) +@@ -2330,9 +2332,9 @@357 class MissingFieldError(Exception): +358 """ +359 An error indicating an entry is unparseable because it lacks required +360 fields. +361 """ +362 +363 missing_field: str +364 """The required field missing from the would-be entry.""" +365 message: str +366 """Message describing what caused this error.""" +367 +368 def __init__(self, missing_field): +369 self.missing_field = missing_field +370 self.message = "Entry from arXiv missing required info" +371 +372 def __repr__(self) -> str: +373 return '{}({})'.format( +374 _classname(self), +375 repr(self.missing_field) +376 )Example: logging
366 def __init__(self, missing_field): -367 self.missing_field = missing_field -368 self.message = "Entry from arXiv missing required info" +@@ -2386,17 +2388,17 @@368 def __init__(self, missing_field): +369 self.missing_field = missing_field +370 self.message = "Entry from arXiv missing required info"Inherited Members
377class SortCriterion(Enum): -378 """ -379 A SortCriterion identifies a property by which search results can be -380 sorted. -381 -382 See [the arXiv API User's Manual: sort order for return -383 results](https://arxiv.org/help/api/user-manual#sort). -384 """ -385 Relevance = "relevance" -386 LastUpdatedDate = "lastUpdatedDate" -387 SubmittedDate = "submittedDate" +@@ -2466,16 +2468,16 @@379class SortCriterion(Enum): +380 """ +381 A SortCriterion identifies a property by which search results can be +382 sorted. +383 +384 See [the arXiv API User's Manual: sort order for return +385 results](https://arxiv.org/help/api/user-manual#sort). +386 """ +387 Relevance = "relevance" +388 LastUpdatedDate = "lastUpdatedDate" +389 SubmittedDate = "submittedDate"Inherited Members
390class SortOrder(Enum): -391 """ -392 A SortOrder indicates order in which search results are sorted according -393 to the specified arxiv.SortCriterion. -394 -395 See [the arXiv API User's Manual: sort order for return -396 results](https://arxiv.org/help/api/user-manual#sort). -397 """ -398 Ascending = "ascending" -399 Descending = "descending" +@@ -2533,108 +2535,108 @@392class SortOrder(Enum): +393 """ +394 A SortOrder indicates order in which search results are sorted according +395 to the specified arxiv.SortCriterion. +396 +397 See [the arXiv API User's Manual: sort order for return +398 results](https://arxiv.org/help/api/user-manual#sort). +399 """ +400 Ascending = "ascending" +401 Descending = "descending"Inherited Members
402class Search(object): -403 """ -404 A specification for a search of arXiv's database. -405 -406 To run a search, use `Search.run` to use a default client or `Client.run` -407 with a specific client. -408 """ -409 -410 query: str -411 """ -412 A query string. -413 -414 This should be unencoded. Use `au:del_maestro AND ti:checkerboard`, not -415 `au:del_maestro+AND+ti:checkerboard`. -416 -417 See [the arXiv API User's Manual: Details of Query -418 Construction](https://arxiv.org/help/api/user-manual#query_details). -419 """ -420 id_list: list -421 """ -422 A list of arXiv article IDs to which to limit the search. -423 -424 See [the arXiv API User's -425 Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list) -426 for documentation of the interaction between `query` and `id_list`. -427 """ -428 max_results: float -429 """ -430 The maximum number of results to be returned in an execution of this -431 search. -432 -433 To fetch every result available, set `max_results=float('inf')`. -434 """ -435 sort_by: SortCriterion -436 """The sort criterion for results.""" -437 sort_order: SortOrder -438 """The sort order for results.""" -439 -440 def __init__( -441 self, -442 query: str = "", -443 id_list: List[str] = [], -444 max_results: float = float('inf'), -445 sort_by: SortCriterion = SortCriterion.Relevance, -446 sort_order: SortOrder = SortOrder.Descending -447 ): -448 """ -449 Constructs an arXiv API search with the specified criteria. -450 """ -451 self.query = query -452 self.id_list = id_list -453 self.max_results = max_results -454 self.sort_by = sort_by -455 self.sort_order = sort_order -456 -457 def __str__(self) -> str: -458 # TODO: develop a more informative string representation. -459 return repr(self) -460 -461 def __repr__(self) -> str: -462 return ( -463 '{}(query={}, id_list={}, max_results={}, sort_by={}, ' -464 'sort_order={})' -465 ).format( -466 _classname(self), -467 repr(self.query), -468 repr(self.id_list), -469 repr(self.max_results), -470 repr(self.sort_by), -471 repr(self.sort_order) -472 ) -473 -474 def _url_args(self) -> Dict[str, str]: -475 """ -476 Returns a dict of search parameters that should be included in an API -477 request for this search. -478 """ -479 return { -480 "search_query": self.query, -481 "id_list": ','.join(self.id_list), -482 "sortBy": self.sort_by.value, -483 "sortOrder": self.sort_order.value -484 } -485 -486 def get(self) -> Generator[Result, None, None]: -487 """ -488 **Deprecated** after 1.2.0; use `Search.results`. -489 """ -490 warnings.warn( -491 "The 'get' method is deprecated, use 'results' instead", -492 DeprecationWarning, -493 stacklevel=2 -494 ) -495 return self.results() -496 -497 def results(self, offset: int = 0) -> Generator[Result, None, None]: -498 """ -499 Executes the specified search using a default arXiv API client. -500 -501 For info on default behavior, see `Client.__init__` and `Client.results`. -502 """ -503 return Client().results(self, offset=offset) +@@ -2655,22 +2657,22 @@404class Search(object): +405 """ +406 A specification for a search of arXiv's database. +407 +408 To run a search, use `Search.run` to use a default client or `Client.run` +409 with a specific client. +410 """ +411 +412 query: str +413 """ +414 A query string. +415 +416 This should be unencoded. Use `au:del_maestro AND ti:checkerboard`, not +417 `au:del_maestro+AND+ti:checkerboard`. +418 +419 See [the arXiv API User's Manual: Details of Query +420 Construction](https://arxiv.org/help/api/user-manual#query_details). +421 """ +422 id_list: List[str] +423 """ +424 A list of arXiv article IDs to which to limit the search. +425 +426 See [the arXiv API User's +427 Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list) +428 for documentation of the interaction between `query` and `id_list`. +429 """ +430 max_results: float +431 """ +432 The maximum number of results to be returned in an execution of this +433 search. +434 +435 To fetch every result available, set `max_results=float('inf')`. +436 """ +437 sort_by: SortCriterion +438 """The sort criterion for results.""" +439 sort_order: SortOrder +440 """The sort order for results.""" +441 +442 def __init__( +443 self, +444 query: str = "", +445 id_list: List[str] = [], +446 max_results: float = float('inf'), +447 sort_by: SortCriterion = SortCriterion.Relevance, +448 sort_order: SortOrder = SortOrder.Descending +449 ): +450 """ +451 Constructs an arXiv API search with the specified criteria. +452 """ +453 self.query = query +454 self.id_list = id_list +455 self.max_results = max_results +456 self.sort_by = sort_by +457 self.sort_order = sort_order +458 +459 def __str__(self) -> str: +460 # TODO: develop a more informative string representation. +461 return repr(self) +462 +463 def __repr__(self) -> str: +464 return ( +465 '{}(query={}, id_list={}, max_results={}, sort_by={}, ' +466 'sort_order={})' +467 ).format( +468 _classname(self), +469 repr(self.query), +470 repr(self.id_list), +471 repr(self.max_results), +472 repr(self.sort_by), +473 repr(self.sort_order) +474 ) +475 +476 def _url_args(self) -> Dict[str, str]: +477 """ +478 Returns a dict of search parameters that should be included in an API +479 request for this search. +480 """ +481 return { +482 "search_query": self.query, +483 "id_list": ','.join(self.id_list), +484 "sortBy": self.sort_by.value, +485 "sortOrder": self.sort_order.value +486 } +487 +488 def get(self) -> Generator[Result, None, None]: +489 """ +490 **Deprecated** after 1.2.0; use `Search.results`. +491 """ +492 warnings.warn( +493 "The 'get' method is deprecated, use 'results' instead", +494 DeprecationWarning, +495 stacklevel=2 +496 ) +497 return self.results() +498 +499 def results(self, offset: int = 0) -> Generator[Result, None, None]: +500 """ +501 Executes the specified search using a default arXiv API client. +502 +503 For info on default behavior, see `Client.__init__` and `Client.results`. +504 """ +505 return Client().results(self, offset=offset)Inherited Members
440 def __init__( -441 self, -442 query: str = "", -443 id_list: List[str] = [], -444 max_results: float = float('inf'), -445 sort_by: SortCriterion = SortCriterion.Relevance, -446 sort_order: SortOrder = SortOrder.Descending -447 ): -448 """ -449 Constructs an arXiv API search with the specified criteria. -450 """ -451 self.query = query -452 self.id_list = id_list -453 self.max_results = max_results -454 self.sort_by = sort_by -455 self.sort_order = sort_order +@@ -2700,7 +2702,7 @@442 def __init__( +443 self, +444 query: str = "", +445 id_list: List[str] = [], +446 max_results: float = float('inf'), +447 sort_by: SortCriterion = SortCriterion.Relevance, +448 sort_order: SortOrder = SortOrder.Descending +449 ): +450 """ +451 Constructs an arXiv API search with the specified criteria. +452 """ +453 self.query = query +454 self.id_list = id_list +455 self.max_results = max_results +456 self.sort_by = sort_by +457 self.sort_order = sort_orderInherited Members
486 def get(self) -> Generator[Result, None, None]: -487 """ -488 **Deprecated** after 1.2.0; use `Search.results`. -489 """ -490 warnings.warn( -491 "The 'get' method is deprecated, use 'results' instead", -492 DeprecationWarning, -493 stacklevel=2 -494 ) -495 return self.results() +@@ -2797,13 +2799,13 @@488 def get(self) -> Generator[Result, None, None]: +489 """ +490 **Deprecated** after 1.2.0; use `Search.results`. +491 """ +492 warnings.warn( +493 "The 'get' method is deprecated, use 'results' instead", +494 DeprecationWarning, +495 stacklevel=2 +496 ) +497 return self.results()Inherited Members
497 def results(self, offset: int = 0) -> Generator[Result, None, None]: -498 """ -499 Executes the specified search using a default arXiv API client. -500 -501 For info on default behavior, see `Client.__init__` and `Client.results`. -502 """ -503 return Client().results(self, offset=offset) +@@ -2826,199 +2828,199 @@499 def results(self, offset: int = 0) -> Generator[Result, None, None]: +500 """ +501 Executes the specified search using a default arXiv API client. +502 +503 For info on default behavior, see `Client.__init__` and `Client.results`. +504 """ +505 return Client().results(self, offset=offset)Inherited Members
506class Client(object): -507 """ -508 Specifies a strategy for fetching results from arXiv's API. -509 -510 This class obscures pagination and retry logic, and exposes -511 `Client.results`. -512 """ -513 -514 query_url_format = 'http://export.arxiv.org/api/query?{}' -515 """The arXiv query API endpoint format.""" -516 page_size: int -517 """Maximum number of results fetched in a single API request.""" -518 delay_seconds: int -519 """Number of seconds to wait between API requests.""" -520 num_retries: int -521 """Number of times to retry a failing API request.""" -522 _last_request_dt: datetime -523 -524 def __init__( -525 self, -526 page_size: int = 100, -527 delay_seconds: int = 3, -528 num_retries: int = 3 -529 ): -530 """ -531 Constructs an arXiv API client with the specified options. -532 -533 Note: the default parameters should provide a robust request strategy -534 for most use cases. Extreme page sizes, delays, or retries risk -535 violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou), -536 brittle behavior, and inconsistent results. -537 """ -538 self.page_size = page_size -539 self.delay_seconds = delay_seconds -540 self.num_retries = num_retries -541 self._last_request_dt = None -542 -543 def __str__(self) -> str: -544 # TODO: develop a more informative string representation. -545 return repr(self) -546 -547 def __repr__(self) -> str: -548 return '{}(page_size={}, delay_seconds={}, num_retries={})'.format( -549 _classname(self), -550 repr(self.page_size), -551 repr(self.delay_seconds), -552 repr(self.num_retries) -553 ) -554 -555 def get(self, search: Search) -> Generator[Result, None, None]: -556 """ -557 **Deprecated** after 1.2.0; use `Client.results`. -558 """ -559 warnings.warn( -560 "The 'get' method is deprecated, use 'results' instead", -561 DeprecationWarning, -562 stacklevel=2 -563 ) -564 return self.results(search) -565 -566 def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]: -567 """ -568 Uses this client configuration to fetch one page of the search results -569 at a time, yielding the parsed `Result`s, until `max_results` results -570 have been yielded or there are no more search results. -571 -572 If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`. +@@ -3039,24 +3041,24 @@508class Client(object): +509 """ +510 Specifies a strategy for fetching results from arXiv's API. +511 +512 This class obscures pagination and retry logic, and exposes +513 `Client.results`. +514 """ +515 +516 query_url_format = 'http://export.arxiv.org/api/query?{}' +517 """The arXiv query API endpoint format.""" +518 page_size: int +519 """Maximum number of results fetched in a single API request.""" +520 delay_seconds: int +521 """Number of seconds to wait between API requests.""" +522 num_retries: int +523 """Number of times to retry a failing API request.""" +524 _last_request_dt: datetime +525 +526 def __init__( +527 self, +528 page_size: int = 100, +529 delay_seconds: int = 3, +530 num_retries: int = 3 +531 ): +532 """ +533 Constructs an arXiv API client with the specified options. +534 +535 Note: the default parameters should provide a robust request strategy +536 for most use cases. Extreme page sizes, delays, or retries risk +537 violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou), +538 brittle behavior, and inconsistent results. +539 """ +540 self.page_size = page_size +541 self.delay_seconds = delay_seconds +542 self.num_retries = num_retries +543 self._last_request_dt = None +544 +545 def __str__(self) -> str: +546 # TODO: develop a more informative string representation. +547 return repr(self) +548 +549 def __repr__(self) -> str: +550 return '{}(page_size={}, delay_seconds={}, num_retries={})'.format( +551 _classname(self), +552 repr(self.page_size), +553 repr(self.delay_seconds), +554 repr(self.num_retries) +555 ) +556 +557 def get(self, search: Search) -> Generator[Result, None, None]: +558 """ +559 **Deprecated** after 1.2.0; use `Client.results`. +560 """ +561 warnings.warn( +562 "The 'get' method is deprecated, use 'results' instead", +563 DeprecationWarning, +564 stacklevel=2 +565 ) +566 return self.results(search) +567 +568 def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]: +569 """ +570 Uses this client configuration to fetch one page of the search results +571 at a time, yielding the parsed `Result`s, until `max_results` results +572 have been yielded or there are no more search results. 573 -574 Setting a nonzero `offset` discards leading records in the result set. -575 When `offset` is greater than or equal to `search.max_results`, the full -576 result set is discarded. -577 -578 For more on using generators, see -579 [Generators](https://wiki.python.org/moin/Generators). -580 """ -581 -582 # total_results may be reduced according to the feed's -583 # opensearch:totalResults value. -584 total_results = search.max_results -585 first_page = True -586 while offset < total_results: -587 page_size = min(self.page_size, search.max_results - offset) -588 logger.info("Requesting {} results at offset {}".format( -589 page_size, -590 offset, -591 )) -592 page_url = self._format_url(search, offset, page_size) -593 feed = self._parse_feed(page_url, first_page) -594 if first_page: -595 # NOTE: this is an ugly fix for a known bug. The totalresults -596 # value is set to 1 for results with zero entries. If that API -597 # bug is fixed, we can remove this conditional and always set -598 # `total_results = min(...)`. -599 if len(feed.entries) == 0: -600 logger.info("Got empty results; stopping generation") -601 total_results = 0 -602 else: -603 total_results = min( -604 total_results, -605 int(feed.feed.opensearch_totalresults) -606 ) -607 logger.info("Got first page; {} of {} results available".format( -608 total_results, -609 search.max_results -610 )) -611 # Subsequent pages are not the first page. -612 first_page = False -613 # Update offset for next request: account for received results. -614 offset += len(feed.entries) -615 # Yield query results until page is exhausted. -616 for entry in feed.entries: -617 try: -618 yield Result._from_feed_entry(entry) -619 except Result.MissingFieldError: -620 logger.warning("Skipping partial result") -621 continue -622 -623 def _format_url(self, search: Search, start: int, page_size: int) -> str: -624 """ -625 Construct a request API for search that returns up to `page_size` -626 results starting with the result at index `start`. -627 """ -628 url_args = search._url_args() -629 url_args.update({ -630 "start": start, -631 "max_results": page_size, -632 }) -633 return self.query_url_format.format(urlencode(url_args)) -634 -635 def _parse_feed( -636 self, -637 url: str, -638 first_page: bool = True -639 ) -> feedparser.FeedParserDict: -640 """ -641 Fetches the specified URL and parses it with feedparser. -642 -643 If a request fails or is unexpectedly empty, retries the request up to -644 `self.num_retries` times. -645 """ -646 # Invoke the recursive helper with initial available retries. -647 return self.__try_parse_feed( -648 url, -649 first_page=first_page, -650 retries_left=self.num_retries -651 ) -652 -653 def __try_parse_feed( -654 self, -655 url: str, -656 first_page: bool, -657 retries_left: int, -658 last_err: Exception = None, -659 ) -> feedparser.FeedParserDict: -660 """ -661 Recursive helper for _parse_feed. Enforces `self.delay_seconds`: if that -662 number of seconds has not passed since `_parse_feed` was last called, -663 sleeps until delay_seconds seconds have passed. -664 """ -665 retry = self.num_retries - retries_left -666 # If this call would violate the rate limit, sleep until it doesn't. -667 if self._last_request_dt is not None: -668 required = timedelta(seconds=self.delay_seconds) -669 since_last_request = datetime.now() - self._last_request_dt -670 if since_last_request < required: -671 to_sleep = (required - since_last_request).total_seconds() -672 logger.info("Sleeping for %f seconds", to_sleep) -673 time.sleep(to_sleep) -674 logger.info("Requesting page of results", extra={ -675 'url': url, -676 'first_page': first_page, -677 'retry': retry, -678 'last_err': last_err.message if last_err is not None else None, -679 }) -680 feed = feedparser.parse(url) -681 self._last_request_dt = datetime.now() -682 err = None -683 if feed.status != 200: -684 err = HTTPError(url, retry, feed) -685 elif len(feed.entries) == 0 and not first_page: -686 err = UnexpectedEmptyPageError(url, retry) -687 if err is not None: -688 if retries_left > 0: -689 return self.__try_parse_feed( -690 url, -691 first_page=first_page, -692 retries_left=retries_left-1, -693 last_err=err, -694 ) -695 # Feed was never returned in self.num_retries tries. Raise the last -696 # exception encountered. -697 raise err -698 return feed +574 If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`. +575 +576 Setting a nonzero `offset` discards leading records in the result set. +577 When `offset` is greater than or equal to `search.max_results`, the full +578 result set is discarded. +579 +580 For more on using generators, see +581 [Generators](https://wiki.python.org/moin/Generators). +582 """ +583 +584 # total_results may be reduced according to the feed's +585 # opensearch:totalResults value. +586 total_results = search.max_results +587 first_page = True +588 while offset < total_results: +589 page_size = min(self.page_size, search.max_results - offset) +590 logger.info("Requesting {} results at offset {}".format( +591 page_size, +592 offset, +593 )) +594 page_url = self._format_url(search, offset, page_size) +595 feed = self._parse_feed(page_url, first_page) +596 if first_page: +597 # NOTE: this is an ugly fix for a known bug. The totalresults +598 # value is set to 1 for results with zero entries. If that API +599 # bug is fixed, we can remove this conditional and always set +600 # `total_results = min(...)`. +601 if len(feed.entries) == 0: +602 logger.info("Got empty results; stopping generation") +603 total_results = 0 +604 else: +605 total_results = min( +606 total_results, +607 int(feed.feed.opensearch_totalresults) +608 ) +609 logger.info("Got first page; {} of {} results available".format( +610 total_results, +611 search.max_results +612 )) +613 # Subsequent pages are not the first page. +614 first_page = False +615 # Update offset for next request: account for received results. +616 offset += len(feed.entries) +617 # Yield query results until page is exhausted. +618 for entry in feed.entries: +619 try: +620 yield Result._from_feed_entry(entry) +621 except Result.MissingFieldError: +622 logger.warning("Skipping partial result") +623 continue +624 +625 def _format_url(self, search: Search, start: int, page_size: int) -> str: +626 """ +627 Construct a request API for search that returns up to `page_size` +628 results starting with the result at index `start`. +629 """ +630 url_args = search._url_args() +631 url_args.update({ +632 "start": start, +633 "max_results": page_size, +634 }) +635 return self.query_url_format.format(urlencode(url_args)) +636 +637 def _parse_feed( +638 self, +639 url: str, +640 first_page: bool = True +641 ) -> feedparser.FeedParserDict: +642 """ +643 Fetches the specified URL and parses it with feedparser. +644 +645 If a request fails or is unexpectedly empty, retries the request up to +646 `self.num_retries` times. +647 """ +648 # Invoke the recursive helper with initial available retries. +649 return self.__try_parse_feed( +650 url, +651 first_page=first_page, +652 retries_left=self.num_retries +653 ) +654 +655 def __try_parse_feed( +656 self, +657 url: str, +658 first_page: bool, +659 retries_left: int, +660 last_err: Exception = None, +661 ) -> feedparser.FeedParserDict: +662 """ +663 Recursive helper for _parse_feed. Enforces `self.delay_seconds`: if that +664 number of seconds has not passed since `_parse_feed` was last called, +665 sleeps until delay_seconds seconds have passed. +666 """ +667 retry = self.num_retries - retries_left +668 # If this call would violate the rate limit, sleep until it doesn't. +669 if self._last_request_dt is not None: +670 required = timedelta(seconds=self.delay_seconds) +671 since_last_request = datetime.now() - self._last_request_dt +672 if since_last_request < required: +673 to_sleep = (required - since_last_request).total_seconds() +674 logger.info("Sleeping for %f seconds", to_sleep) +675 time.sleep(to_sleep) +676 logger.info("Requesting page of results", extra={ +677 'url': url, +678 'first_page': first_page, +679 'retry': retry, +680 'last_err': last_err.message if last_err is not None else None, +681 }) +682 feed = feedparser.parse(url) +683 self._last_request_dt = datetime.now() +684 err = None +685 if feed.status != 200: +686 err = HTTPError(url, retry, feed) +687 elif len(feed.entries) == 0 and not first_page: +688 err = UnexpectedEmptyPageError(url, retry) +689 if err is not None: +690 if retries_left > 0: +691 return self.__try_parse_feed( +692 url, +693 first_page=first_page, +694 retries_left=retries_left-1, +695 last_err=err, +696 ) +697 # Feed was never returned in self.num_retries tries. Raise the last +698 # exception encountered. +699 raise err +700 return feedInherited Members
524 def __init__( -525 self, -526 page_size: int = 100, -527 delay_seconds: int = 3, -528 num_retries: int = 3 -529 ): -530 """ -531 Constructs an arXiv API client with the specified options. -532 -533 Note: the default parameters should provide a robust request strategy -534 for most use cases. Extreme page sizes, delays, or retries risk -535 violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou), -536 brittle behavior, and inconsistent results. -537 """ -538 self.page_size = page_size -539 self.delay_seconds = delay_seconds -540 self.num_retries = num_retries -541 self._last_request_dt = None +@@ -3134,16 +3136,16 @@526 def __init__( +527 self, +528 page_size: int = 100, +529 delay_seconds: int = 3, +530 num_retries: int = 3 +531 ): +532 """ +533 Constructs an arXiv API client with the specified options. +534 +535 Note: the default parameters should provide a robust request strategy +536 for most use cases. Extreme page sizes, delays, or retries risk +537 violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou), +538 brittle behavior, and inconsistent results. +539 """ +540 self.page_size = page_size +541 self.delay_seconds = delay_seconds +542 self.num_retries = num_retries +543 self._last_request_dt = NoneInherited Members
555 def get(self, search: Search) -> Generator[Result, None, None]: -556 """ -557 **Deprecated** after 1.2.0; use `Client.results`. -558 """ -559 warnings.warn( -560 "The 'get' method is deprecated, use 'results' instead", -561 DeprecationWarning, -562 stacklevel=2 -563 ) -564 return self.results(search) +@@ -3163,62 +3165,62 @@557 def get(self, search: Search) -> Generator[Result, None, None]: +558 """ +559 **Deprecated** after 1.2.0; use `Client.results`. +560 """ +561 warnings.warn( +562 "The 'get' method is deprecated, use 'results' instead", +563 DeprecationWarning, +564 stacklevel=2 +565 ) +566 return self.results(search)Inherited Members
566 def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]: -567 """ -568 Uses this client configuration to fetch one page of the search results -569 at a time, yielding the parsed `Result`s, until `max_results` results -570 have been yielded or there are no more search results. -571 -572 If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`. +@@ -3250,30 +3252,30 @@568 def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]: +569 """ +570 Uses this client configuration to fetch one page of the search results +571 at a time, yielding the parsed `Result`s, until `max_results` results +572 have been yielded or there are no more search results. 573 -574 Setting a nonzero `offset` discards leading records in the result set. -575 When `offset` is greater than or equal to `search.max_results`, the full -576 result set is discarded. -577 -578 For more on using generators, see -579 [Generators](https://wiki.python.org/moin/Generators). -580 """ -581 -582 # total_results may be reduced according to the feed's -583 # opensearch:totalResults value. -584 total_results = search.max_results -585 first_page = True -586 while offset < total_results: -587 page_size = min(self.page_size, search.max_results - offset) -588 logger.info("Requesting {} results at offset {}".format( -589 page_size, -590 offset, -591 )) -592 page_url = self._format_url(search, offset, page_size) -593 feed = self._parse_feed(page_url, first_page) -594 if first_page: -595 # NOTE: this is an ugly fix for a known bug. The totalresults -596 # value is set to 1 for results with zero entries. If that API -597 # bug is fixed, we can remove this conditional and always set -598 # `total_results = min(...)`. -599 if len(feed.entries) == 0: -600 logger.info("Got empty results; stopping generation") -601 total_results = 0 -602 else: -603 total_results = min( -604 total_results, -605 int(feed.feed.opensearch_totalresults) -606 ) -607 logger.info("Got first page; {} of {} results available".format( -608 total_results, -609 search.max_results -610 )) -611 # Subsequent pages are not the first page. -612 first_page = False -613 # Update offset for next request: account for received results. -614 offset += len(feed.entries) -615 # Yield query results until page is exhausted. -616 for entry in feed.entries: -617 try: -618 yield Result._from_feed_entry(entry) -619 except Result.MissingFieldError: -620 logger.warning("Skipping partial result") -621 continue +574 If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`. +575 +576 Setting a nonzero `offset` discards leading records in the result set. +577 When `offset` is greater than or equal to `search.max_results`, the full +578 result set is discarded. +579 +580 For more on using generators, see +581 [Generators](https://wiki.python.org/moin/Generators). +582 """ +583 +584 # total_results may be reduced according to the feed's +585 # opensearch:totalResults value. +586 total_results = search.max_results +587 first_page = True +588 while offset < total_results: +589 page_size = min(self.page_size, search.max_results - offset) +590 logger.info("Requesting {} results at offset {}".format( +591 page_size, +592 offset, +593 )) +594 page_url = self._format_url(search, offset, page_size) +595 feed = self._parse_feed(page_url, first_page) +596 if first_page: +597 # NOTE: this is an ugly fix for a known bug. The totalresults +598 # value is set to 1 for results with zero entries. If that API +599 # bug is fixed, we can remove this conditional and always set +600 # `total_results = min(...)`. +601 if len(feed.entries) == 0: +602 logger.info("Got empty results; stopping generation") +603 total_results = 0 +604 else: +605 total_results = min( +606 total_results, +607 int(feed.feed.opensearch_totalresults) +608 ) +609 logger.info("Got first page; {} of {} results available".format( +610 total_results, +611 search.max_results +612 )) +613 # Subsequent pages are not the first page. +614 first_page = False +615 # Update offset for next request: account for received results. +616 offset += len(feed.entries) +617 # Yield query results until page is exhausted. +618 for entry in feed.entries: +619 try: +620 yield Result._from_feed_entry(entry) +621 except Result.MissingFieldError: +622 logger.warning("Skipping partial result") +623 continueInherited Members
701class ArxivError(Exception): -702 """This package's base Exception class.""" -703 -704 url: str -705 """The feed URL that could not be fetched.""" -706 retry: int -707 """ -708 The request try number which encountered this error; 0 for the initial try, -709 1 for the first retry, and so on. -710 """ -711 message: str -712 """Message describing what caused this error.""" -713 -714 def __init__(self, url: str, retry: int, message: str): -715 """ -716 Constructs an `ArxivError` encountered while fetching the specified URL. -717 """ -718 self.url = url -719 self.retry = retry -720 self.message = message -721 super().__init__(self.message) -722 -723 def __str__(self) -> str: -724 return '{} ({})'.format(self.message, self.url) +@@ -3291,14 +3293,14 @@703class ArxivError(Exception): +704 """This package's base Exception class.""" +705 +706 url: str +707 """The feed URL that could not be fetched.""" +708 retry: int +709 """ +710 The request try number which encountered this error; 0 for the initial try, +711 1 for the first retry, and so on. +712 """ +713 message: str +714 """Message describing what caused this error.""" +715 +716 def __init__(self, url: str, retry: int, message: str): +717 """ +718 Constructs an `ArxivError` encountered while fetching the specified URL. +719 """ +720 self.url = url +721 self.retry = retry +722 self.message = message +723 super().__init__(self.message) +724 +725 def __str__(self) -> str: +726 return '{} ({})'.format(self.message, self.url)Inherited Members
714 def __init__(self, url: str, retry: int, message: str): -715 """ -716 Constructs an `ArxivError` encountered while fetching the specified URL. -717 """ -718 self.url = url -719 self.retry = retry -720 self.message = message -721 super().__init__(self.message) +@@ -3368,29 +3370,29 @@716 def __init__(self, url: str, retry: int, message: str): +717 """ +718 Constructs an `ArxivError` encountered while fetching the specified URL. +719 """ +720 self.url = url +721 self.retry = retry +722 self.message = message +723 super().__init__(self.message)Inherited Members
727class UnexpectedEmptyPageError(ArxivError): -728 """ -729 An error raised when a page of results that should be non-empty is empty. -730 -731 This should never happen in theory, but happens sporadically due to -732 brittleness in the underlying arXiv API; usually resolved by retries. -733 -734 See `Client.results` for usage. -735 """ -736 def __init__(self, url: str, retry: int): -737 """ -738 Constructs an `UnexpectedEmptyPageError` encountered for the specified -739 API URL after `retry` tries. -740 """ -741 self.url = url -742 super().__init__(url, retry, "Page of results was unexpectedly empty") -743 -744 def __repr__(self) -> str: -745 return '{}({}, {})'.format( -746 _classname(self), -747 repr(self.url), -748 repr(self.retry) -749 ) +@@ -3413,13 +3415,13 @@729class UnexpectedEmptyPageError(ArxivError): +730 """ +731 An error raised when a page of results that should be non-empty is empty. +732 +733 This should never happen in theory, but happens sporadically due to +734 brittleness in the underlying arXiv API; usually resolved by retries. +735 +736 See `Client.results` for usage. +737 """ +738 def __init__(self, url: str, retry: int): +739 """ +740 Constructs an `UnexpectedEmptyPageError` encountered for the specified +741 API URL after `retry` tries. +742 """ +743 self.url = url +744 super().__init__(url, retry, "Page of results was unexpectedly empty") +745 +746 def __repr__(self) -> str: +747 return '{}({}, {})'.format( +748 _classname(self), +749 repr(self.url), +750 repr(self.retry) +751 )Inherited Members
736 def __init__(self, url: str, retry: int): -737 """ -738 Constructs an `UnexpectedEmptyPageError` encountered for the specified -739 API URL after `retry` tries. -740 """ -741 self.url = url -742 super().__init__(url, retry, "Page of results was unexpectedly empty") +@@ -3468,47 +3470,47 @@738 def __init__(self, url: str, retry: int): +739 """ +740 Constructs an `UnexpectedEmptyPageError` encountered for the specified +741 API URL after `retry` tries. +742 """ +743 self.url = url +744 super().__init__(url, retry, "Page of results was unexpectedly empty")Inherited Members
752class HTTPError(ArxivError): -753 """ -754 A non-200 status encountered while fetching a page of results. -755 -756 See `Client.results` for usage. -757 """ -758 -759 status: int -760 """The HTTP status reported by feedparser.""" -761 entry: feedparser.FeedParserDict -762 """The feed entry describing the error, if present.""" -763 -764 def __init__(self, url: str, retry: int, feed: feedparser.FeedParserDict): -765 """ -766 Constructs an `HTTPError` for the specified status code, encountered for -767 the specified API URL after `retry` tries. -768 """ -769 self.url = url -770 self.status = feed.status -771 # If the feed is valid and includes a single entry, trust it's an -772 # explanation. -773 if not feed.bozo and len(feed.entries) == 1: -774 self.entry = feed.entries[0] -775 else: -776 self.entry = None -777 super().__init__( -778 url, -779 retry, -780 "Page request resulted in HTTP {}: {}".format( -781 self.status, -782 self.entry.summary if self.entry else None, -783 ), -784 ) -785 -786 def __repr__(self) -> str: -787 return '{}({}, {}, {})'.format( -788 _classname(self), -789 repr(self.url), -790 repr(self.retry), -791 repr(self.status) -792 ) +@@ -3528,27 +3530,27 @@754class HTTPError(ArxivError): +755 """ +756 A non-200 status encountered while fetching a page of results. +757 +758 See `Client.results` for usage. +759 """ +760 +761 status: int +762 """The HTTP status reported by feedparser.""" +763 entry: feedparser.FeedParserDict +764 """The feed entry describing the error, if present.""" +765 +766 def __init__(self, url: str, retry: int, feed: feedparser.FeedParserDict): +767 """ +768 Constructs an `HTTPError` for the specified status code, encountered for +769 the specified API URL after `retry` tries. +770 """ +771 self.url = url +772 self.status = feed.status +773 # If the feed is valid and includes a single entry, trust it's an +774 # explanation. +775 if not feed.bozo and len(feed.entries) == 1: +776 self.entry = feed.entries[0] +777 else: +778 self.entry = None +779 super().__init__( +780 url, +781 retry, +782 "Page request resulted in HTTP {}: {}".format( +783 self.status, +784 self.entry.summary if self.entry else None, +785 ), +786 ) +787 +788 def __repr__(self) -> str: +789 return '{}({}, {}, {})'.format( +790 _classname(self), +791 repr(self.url), +792 repr(self.retry), +793 repr(self.status) +794 )Inherited Members
764 def __init__(self, url: str, retry: int, feed: feedparser.FeedParserDict): -765 """ -766 Constructs an `HTTPError` for the specified status code, encountered for -767 the specified API URL after `retry` tries. -768 """ -769 self.url = url -770 self.status = feed.status -771 # If the feed is valid and includes a single entry, trust it's an -772 # explanation. -773 if not feed.bozo and len(feed.entries) == 1: -774 self.entry = feed.entries[0] -775 else: -776 self.entry = None -777 super().__init__( -778 url, -779 retry, -780 "Page request resulted in HTTP {}: {}".format( -781 self.status, -782 self.entry.summary if self.entry else None, -783 ), -784 ) +diff --git a/setup.py b/setup.py index 947ba2a..1bd5565 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup -version = '1.4.4' +version = '1.4.5' with open('README.md', 'r') as fh: long_description = fh.read()766 def __init__(self, url: str, retry: int, feed: feedparser.FeedParserDict): +767 """ +768 Constructs an `HTTPError` for the specified status code, encountered for +769 the specified API URL after `retry` tries. +770 """ +771 self.url = url +772 self.status = feed.status +773 # If the feed is valid and includes a single entry, trust it's an +774 # explanation. +775 if not feed.bozo and len(feed.entries) == 1: +776 self.entry = feed.entries[0] +777 else: +778 self.entry = None +779 super().__init__( +780 url, +781 retry, +782 "Page request resulted in HTTP {}: {}".format( +783 self.status, +784 self.entry.summary if self.entry else None, +785 ), +786 )