
Commit

Ported over openalex code
The code was modified to sleep between requests, to hopefully stay within the
100k daily request limit. It was also adjusted to page through the works
responses.
edsu committed Jun 20, 2024
1 parent e614313 commit 4c8b57d
Showing 6 changed files with 166 additions and 43 deletions.
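
For a sense of how the new pieces fit together, here is a minimal usage sketch (an editor's illustration, not part of the commit; the CSV path is taken from the tests below, and the pickle path is a placeholder):

    import pickle

    from rialto_airflow.harvest import openalex

    # build the DOI -> ORCID mapping pickle from an authors CSV
    openalex.doi_orcid_pickle("test/data/authors.csv", "openalex-doi-orcid.pickle")

    with open("openalex-doi-orcid.pickle", "rb") as handle:
        doi_orcids = pickle.load(handle)  # maps DOI -> list of ORCIDs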
21 changes: 5 additions & 16 deletions rialto_airflow/harvest/dimensions.py
@@ -8,6 +8,8 @@
 import re
 import requests

+from rialto_airflow.utils import invert_dict
+
 dotenv.load_dotenv()

 dimcli.login(
@@ -41,23 +43,10 @@ def dimensions_dois_from_orcid(orcid):
     if len(result["publications"]) == 1000:
         logging.warning("Truncated results for ORCID %s", orcid)
     for pub in result["publications"]:
-        if pub.get("doi"):
-            yield "https://doi.org/" + pub["doi"]
-
-
-def invert_dict(dict):
-    # Inverting the dictionary so that DOI is the common key for all tasks.
-    # This adds some complexity here but reduces complexity in downstream tasks.
-    original_values = []
-    for v in dict.values():
-        original_values.extend(v)
-    original_values = list(set(original_values))
-
-    inverted_dict = {}
-    for i in original_values:
-        inverted_dict[i] = [k for k, v in dict.items() if i in v]
+        doi = pub.get("doi")
+        if doi:
+            yield pub["doi"]

-    return inverted_dict


 def dimensions_doi_orcids_dict(org_data_file, pickle_file, limit=None):
75 changes: 75 additions & 0 deletions rialto_airflow/harvest/openalex.py
@@ -0,0 +1,75 @@
import csv
import logging
import pickle
import time

import requests

from rialto_airflow.utils import invert_dict


def doi_orcid_pickle(authors_csv, pickle_file):
    """
    Pass in the Authors CSV and generate a DOI -> ORCID mapping as a pickle file.
    """
    with open(authors_csv, 'r') as csv_input:
        orcid_dois = {}
        for row in csv.DictReader(csv_input):
            orcid = row['orcidid'].replace('https://orcid.org/', '')
            orcid_dois[orcid] = list(dois_from_orcid(orcid))

    with open(pickle_file, "wb") as handle:
        pickle.dump(invert_dict(orcid_dois), handle, protocol=pickle.HIGHEST_PROTOCOL)

def dois_from_orcid(orcid: str):
    """
    Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.
    """

    # TODO: get a key so we don't have to sleep!
    time.sleep(1)

    orcid = f'https://orcid.org/{orcid}'
    author_resp = requests.get(f"https://api.openalex.org/authors/{orcid}", allow_redirects=True)
    if author_resp.status_code == 200:
        author_id = author_resp.json()['id'].replace('https://openalex.org/', '')
        for pub in works_from_author_id(author_id):
            # not all publications have DOIs
            doi = pub.get('doi')
            if doi:
                yield doi

def works_from_author_id(author_id, limit=None):
    """
    Pass in the OpenAlex Author ID and get back an iterator of works.
    """
    url = "https://api.openalex.org/works"
    params = {
        "filter": f"author.id:{author_id}",
        "per_page": 200
    }

    count = 0
    page = 0
    has_more = True
    while has_more:
        page += 1
        params['page'] = page
        resp = requests.get(url, params)

        if resp.status_code == 200:
            # TODO: get a key so we don't have to sleep!
            time.sleep(1)
            results = resp.json().get("results")
            if len(results) == 0:
                has_more = False
            else:
                for result in results:
                    count += 1
                    if limit is not None and count > limit:
                        has_more = False
                    else:
                        yield result
        else:
            logging.error(f"encountered non-200 response: {url} {params}")
            has_more = False
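
Two notes on the paging loop above (editor's notes, not changes in the commit): OpenAlex's page parameter only reaches the first 10,000 results of a query, and the API's faster "polite pool" is selected by sending a mailto parameter rather than an API key. A cursor-based sketch under those assumptions, with a placeholder address:

    import requests

    def works_by_cursor(author_id, mailto="rialto@example.edu"):
        # cursor paging: start at "*" and follow meta.next_cursor until it is None
        params = {
            "filter": f"author.id:{author_id}",
            "per_page": 200,
            "cursor": "*",
            "mailto": mailto,  # placeholder; identifies the client to OpenAlex
        }
        while params["cursor"]:
            resp = requests.get("https://api.openalex.org/works", params=params)
            resp.raise_for_status()
            data = resp.json()
            yield from data["results"]
            params["cursor"] = data["meta"]["next_cursor"]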
19 changes: 19 additions & 0 deletions rialto_airflow/utils.py
@@ -33,3 +33,22 @@ def rialto_authors_orcids(rialto_authors_file):
     for row in reader:
         orcids.append(row[orcidid])
     return orcids
+
+
+def invert_dict(dict):
+    """
+    Inverting the dictionary so that DOI is the common key for all tasks.
+    This adds some complexity here but reduces complexity in downstream tasks.
+    """
+    original_values = []
+    for v in dict.values():
+        original_values.extend(v)
+    original_values = list(set(original_values))
+
+    inverted_dict = {}
+    for i in original_values:
+        inverted_dict[i] = [k for k, v in dict.items() if i in v]
+
+    return inverted_dict
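
A quick illustration of the inversion (editor's sketch; the IDs are made up):

    from rialto_airflow.utils import invert_dict

    orcid_dois = {
        "orcid-1": ["doi-a", "doi-b"],
        "orcid-2": ["doi-b"],
    }
    # DOI becomes the common key; each value lists every ORCID that had it
    assert invert_dict(orcid_dois) == {
        "doi-a": ["orcid-1"],
        "doi-b": ["orcid-1", "orcid-2"],
    }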


35 changes: 9 additions & 26 deletions test/harvest/test_dimensions.py
@@ -3,7 +3,8 @@
 import pickle
 import pytest

-from rialto_airflow.harvest.dimensions import dimensions_doi_orcids_dict, invert_dict
+from rialto_airflow.harvest.dimensions import dimensions_doi_orcids_dict
+

 dotenv.load_dotenv()

@@ -16,34 +17,16 @@
 @pytest.mark.skipif(no_auth, reason="no dimensions key")
 def test_dimensions_doi_orcids_dict(tmpdir):
     pickle_file = tmpdir / "dimensions.pickle"
-    dimensions_doi_orcids_dict("test/data/authors.csv", pickle_file, limit=5)
+    dimensions_doi_orcids_dict(
+        "test/data/authors.csv", pickle_file, limit=5
+    )
     assert pickle_file.isfile()

     with open(pickle_file, "rb") as handle:
         doi_orcids = pickle.load(handle)

     assert len(doi_orcids) > 0
-    assert doi_orcids["https://doi.org/10.1109/lra.2018.2890209"] == [
-        "0000-0002-0770-2940"
-    ]
-
-
-def test_invert_dict():
-    dict = {
-        "person_id1": ["pub_id1", "pub_id2", "pub_id3"],
-        "person_id2": ["pub_id2", "pub_id4", "pub_id5"],
-        "person_id3": ["pub_id5", "pub_id6", "pub_id7"],
-    }
-
-    inverted_dict = invert_dict(dict)
-    assert len(inverted_dict.items()) == 7
-    assert sorted(inverted_dict.keys()) == [
-        "pub_id1",
-        "pub_id2",
-        "pub_id3",
-        "pub_id4",
-        "pub_id5",
-        "pub_id6",
-        "pub_id7",
-    ]
-    assert inverted_dict["pub_id2"] == ["person_id1", "person_id2"]
+    assert doi_orcids["10.1109/lra.2018.2890209"] == ["0000-0002-0770-2940"]



34 changes: 34 additions & 0 deletions test/harvest/test_openalex.py
@@ -0,0 +1,34 @@
import pickle
import re

from rialto_airflow.harvest import openalex


def test_dois_from_orcid():
    dois = list(openalex.dois_from_orcid('0000-0002-1298-3089'))
    assert len(dois) >= 54

def test_works_from_author_id():
    # the openalex api returns at most 200 works per page, so ensure that paging is working
    # for Akihisa Inoue, who has a lot of publications (> 4,000)
    works = list(openalex.works_from_author_id('a5008412118', limit=600))
    assert len(works) == 600, 'paging is limited to 600 works'
    assert len(set([work['id'] for work in works])) == 600, 'the works are unique'

def test_doi_orcid_pickle(tmp_path):
    pickle_file = tmp_path / "openalex-doi-orcid.pickle"
    openalex.doi_orcid_pickle('test/data/authors.csv', pickle_file)
    assert pickle_file.is_file(), "created the pickle file"

    mapping = pickle.load(pickle_file.open('rb'))
    assert isinstance(mapping, dict)
    assert len(mapping) > 0

    doi = list(mapping.keys())[0]
    assert '/' in doi

    orcids = mapping[doi]
    assert isinstance(orcids, list)
    assert len(orcids) > 0
    assert re.match(r'^\d+-\d+-\d+-\d+$', orcids[0])
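
These tests exercise the live OpenAlex API, so they sleep between requests and depend on data that can change. A sketch of testing the paging logic against stubbed responses instead (uses unittest.mock; an editor's illustration, not part of the commit, with a made-up author ID):

    from unittest.mock import MagicMock, patch

    from rialto_airflow.harvest import openalex

    def test_works_from_author_id_stubbed():
        page1 = MagicMock(status_code=200)
        page1.json.return_value = {"results": [{"id": "W1"}, {"id": "W2"}]}
        empty = MagicMock(status_code=200)
        empty.json.return_value = {"results": []}
        # one page of results followed by an empty page ends the paging loop
        with patch("rialto_airflow.harvest.openalex.requests.get", side_effect=[page1, empty]), \
             patch("rialto_airflow.harvest.openalex.time.sleep"):
            works = list(openalex.works_from_author_id("a123"))
        assert [w["id"] for w in works] == ["W1", "W2"]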
25 changes: 24 additions & 1 deletion test/test_utils.py
@@ -1,7 +1,9 @@
 import csv
 from pathlib import Path

 import pytest
-from rialto_airflow.utils import create_snapshot_dir, rialto_authors_orcids
+
+from rialto_airflow.utils import create_snapshot_dir, invert_dict, rialto_authors_orcids


 @pytest.fixture
@@ -25,3 +27,24 @@ def test_rialto_authors_orcids(tmp_path, authors_csv):
     orcids = rialto_authors_orcids(authors_csv)
     assert len(orcids) == 2
     assert "https://orcid.org/0000-0000-0000-0001" in orcids
+
+
+def test_invert_dict():
+    dict = {
+        "person_id1": ["pub_id1", "pub_id2", "pub_id3"],
+        "person_id2": ["pub_id2", "pub_id4", "pub_id5"],
+        "person_id3": ["pub_id5", "pub_id6", "pub_id7"],
+    }
+
+    inverted_dict = invert_dict(dict)
+    assert len(inverted_dict.items()) == 7
+    assert sorted(inverted_dict.keys()) == [
+        "pub_id1",
+        "pub_id2",
+        "pub_id3",
+        "pub_id4",
+        "pub_id5",
+        "pub_id6",
+        "pub_id7",
+    ]
+    assert inverted_dict["pub_id2"] == ["person_id1", "person_id2"]
