From 1899346cf7c9347f0f3c4343658aaa53e9b8d187 Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Wed, 10 Jul 2024 20:57:50 -0400
Subject: [PATCH] Limit the response size for ORCID lookup

Looking up the works for an ORCID can encounter a Varnish error from the
OpenAlex API when the response is too big. To avoid this we request only
the DOI, which limits the size of the response and means we can continue
paging with `per_page=200`.

Fixes #79
---
 rialto_airflow/harvest/openalex.py | 4 +++-
 test/harvest/test_openalex.py      | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index e1d9eeb..051fc6d 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -58,7 +58,9 @@ def dois_from_orcid(orcid: str, limit=None):
 
     # get all the works for the openalex author id
     work_count = 0
-    for page in Works().filter(author={"id": author_id}).paginate(per_page=200):
+    for page in (
+        Works().filter(author={"id": author_id}).select(["doi"]).paginate(per_page=200)
+    ):
         for pub in page:
             if pub.get("doi"):
                 work_count += 1
diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py
index fdd054c..1335d94 100644
--- a/test/harvest/test_openalex.py
+++ b/test/harvest/test_openalex.py
@@ -98,3 +98,9 @@ def test_pyalex_urlencoding():
         )
         == 2
     ), "we handle url URL encoding DOIs until pyalex does"
+
+
+def test_pyalex_varnish_bug():
+    # it seems like this author has a few records that are so big they blow out
+    # OpenAlex's Varnish index. See https://groups.google.com/u/1/g/openalex-community/c/hl09WRF3Naw
+    assert len(list(openalex.dois_from_orcid("0000-0003-3859-2905"))) > 270
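
For context when reading the change above: a minimal, self-contained sketch
of the select() approach, assuming pyalex is installed. The dois_for_author
helper and the Authors() ORCID lookup are illustrative and not code from
this repository; the filter/select/paginate chain is the one the patch adds.

    from pyalex import Authors, Works

    def dois_for_author(orcid: str) -> list[str]:
        """Return the DOIs of all OpenAlex works for an ORCID."""
        # Resolve the ORCID to an OpenAlex author record. (In the patched
        # dois_from_orcid, author_id is already in scope at this point.)
        author = Authors()["https://orcid.org/" + orcid]
        author_id = author["id"]

        dois = []
        # select(["doi"]) asks the API to return only the doi field for each
        # work, which keeps pages small enough to avoid the oversized-response
        # Varnish errors while still paging 200 records at a time.
        for page in (
            Works()
            .filter(author={"id": author_id})
            .select(["doi"])
            .paginate(per_page=200)
        ):
            for pub in page:
                if pub.get("doi"):
                    dois.append(pub["doi"])
        return dois

Per the new test, a call like dois_for_author("0000-0003-3859-2905") should
come back with well over 270 DOIs without tripping the Varnish error.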