
Commit

Ported over openalex code
The code was modified to sleep between requests, to hopefully stay within the
100k daily request limit. It was also adjusted to page through the works
responses.
edsu committed Jun 20, 2024
1 parent e614313 commit 4c8b57d
Showing 6 changed files with 166 additions and 43 deletions.
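
For a sense of how the new pieces fit together, here is a minimal usage sketch (an editor's illustration, not part of the commit; the CSV path is taken from the tests below, and the pickle path is a placeholder):

    import pickle

    from rialto_airflow.harvest import openalex

    # build the DOI -> ORCID mapping pickle from an authors CSV
    openalex.doi_orcid_pickle("test/data/authors.csv", "openalex-doi-orcid.pickle")

    with open("openalex-doi-orcid.pickle", "rb") as handle:
        doi_orcids = pickle.load(handle)  # maps DOI -> list of ORCIDs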
21 changes: 5 additions & 16 deletions rialto_airflow/harvest/dimensions.py
@@ -8,6 +8,8 @@
 import re
 import requests

+from rialto_airflow.utils import invert_dict
+
 dotenv.load_dotenv()

 dimcli.login(
@@ -41,23 +43,10 @@ def dimensions_dois_from_orcid(orcid):
     if len(result["publications"]) == 1000:
         logging.warning("Truncated results for ORCID %s", orcid)
     for pub in result["publications"]:
-        if pub.get("doi"):
-            yield "https://doi.org/" + pub["doi"]
-
-
-def invert_dict(dict):
-    # Inverting the dictionary so that DOI is the common key for all tasks.
-    # This adds some complexity here but reduces complexity in downstream tasks.
-    original_values = []
-    for v in dict.values():
-        original_values.extend(v)
-    original_values = list(set(original_values))
-
-    inverted_dict = {}
-    for i in original_values:
-        inverted_dict[i] = [k for k, v in dict.items() if i in v]
+        doi = pub.get("doi")
+        if doi:
+            yield pub["doi"]

-    return inverted_dict


 def dimensions_doi_orcids_dict(org_data_file, pickle_file, limit=None):
75 changes: 75 additions & 0 deletions rialto_airflow/harvest/openalex.py
@@ -0,0 +1,75 @@
import csv
import logging
import pickle
import time

import requests

from rialto_airflow.utils import invert_dict


def doi_orcid_pickle(authors_csv, pickle_file):
    """
    Pass in the Authors CSV and generate a DOI -> ORCID mapping as a pickle file.
    """
    with open(authors_csv, 'r') as csv_input:
        orcid_dois = {}
        for row in csv.DictReader(csv_input):
            orcid = row['orcidid'].replace('https://orcid.org/', '')
            orcid_dois[orcid] = list(dois_from_orcid(orcid))

    with open(pickle_file, "wb") as handle:
        pickle.dump(invert_dict(orcid_dois), handle, protocol=pickle.HIGHEST_PROTOCOL)

def dois_from_orcid(orcid: str):
    """
    Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.
    """

    # TODO: get a key so we don't have to sleep!
    time.sleep(1)

    orcid = f'https://orcid.org/{orcid}'
    author_resp = requests.get(f"https://api.openalex.org/authors/{orcid}", allow_redirects=True)
    if author_resp.status_code == 200:
        author_id = author_resp.json()['id'].replace('https://openalex.org/', '')
        for pub in works_from_author_id(author_id):
            # not all publications have DOIs
            doi = pub.get('doi')
            if doi:
                yield doi

def works_from_author_id(author_id, limit=None):
    """
    Pass in the OpenAlex Author ID and get back an iterator of works.
    """
    url = "https://api.openalex.org/works"
    params = {
        "filter": f"author.id:{author_id}",
        "per_page": 200
    }

    count = 0
    page = 0
    has_more = True
    while has_more:
        page += 1
        params['page'] = page
        resp = requests.get(url, params)

        if resp.status_code == 200:
            # TODO: get a key so we don't have to sleep!
            time.sleep(1)
            results = resp.json().get("results")
            if len(results) == 0:
                has_more = False
            else:
                for result in results:
                    count += 1
                    if limit is not None and count > limit:
                        has_more = False
                    else:
                        yield result
        else:
            logging.error(f"encountered non-200 response: {url} {params}")
            has_more = False
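
Two notes on the paging loop above (editor's notes, not changes in the commit): OpenAlex's page parameter only reaches the first 10,000 results of a query, and the API's faster "polite pool" is selected by sending a mailto parameter rather than an API key. A cursor-based sketch under those assumptions, with a placeholder address:

    import requests

    def works_by_cursor(author_id, mailto="rialto@example.edu"):
        # cursor paging: start at "*" and follow meta.next_cursor until it is None
        params = {
            "filter": f"author.id:{author_id}",
            "per_page": 200,
            "cursor": "*",
            "mailto": mailto,  # placeholder; identifies the client to OpenAlex
        }
        while params["cursor"]:
            resp = requests.get("https://api.openalex.org/works", params=params)
            resp.raise_for_status()
            data = resp.json()
            yield from data["results"]
            params["cursor"] = data["meta"]["next_cursor"]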
19 changes: 19 additions & 0 deletions rialto_airflow/utils.py
@@ -33,3 +33,22 @@ def rialto_authors_orcids(rialto_authors_file):
     for row in reader:
         orcids.append(row[orcidid])
     return orcids
+
+
+def invert_dict(dict):
+    """
+    Inverting the dictionary so that DOI is the common key for all tasks.
+    This adds some complexity here but reduces complexity in downstream tasks.
+    """
+    original_values = []
+    for v in dict.values():
+        original_values.extend(v)
+    original_values = list(set(original_values))
+
+    inverted_dict = {}
+    for i in original_values:
+        inverted_dict[i] = [k for k, v in dict.items() if i in v]
+
+    return inverted_dict
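
A quick illustration of the inversion (editor's sketch; the IDs are made up):

    from rialto_airflow.utils import invert_dict

    orcid_dois = {
        "orcid-1": ["doi-a", "doi-b"],
        "orcid-2": ["doi-b"],
    }
    # DOI becomes the common key; each value lists every ORCID that had it
    assert invert_dict(orcid_dois) == {
        "doi-a": ["orcid-1"],
        "doi-b": ["orcid-1", "orcid-2"],
    }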


35 changes: 9 additions & 26 deletions test/harvest/test_dimensions.py
@@ -3,7 +3,8 @@
 import pickle
 import pytest

-from rialto_airflow.harvest.dimensions import dimensions_doi_orcids_dict, invert_dict
+from rialto_airflow.harvest.dimensions import dimensions_doi_orcids_dict
+

 dotenv.load_dotenv()

@@ -16,34 +17,16 @@
 @pytest.mark.skipif(no_auth, reason="no dimensions key")
 def test_dimensions_doi_orcids_dict(tmpdir):
     pickle_file = tmpdir / "dimensions.pickle"
-    dimensions_doi_orcids_dict("test/data/authors.csv", pickle_file, limit=5)
+    dimensions_doi_orcids_dict(
+        "test/data/authors.csv", pickle_file, limit=5
+    )
     assert pickle_file.isfile()

     with open(pickle_file, "rb") as handle:
         doi_orcids = pickle.load(handle)

     assert len(doi_orcids) > 0
-    assert doi_orcids["https://doi.org/10.1109/lra.2018.2890209"] == [
-        "0000-0002-0770-2940"
-    ]
-
-
-def test_invert_dict():
-    dict = {
-        "person_id1": ["pub_id1", "pub_id2", "pub_id3"],
-        "person_id2": ["pub_id2", "pub_id4", "pub_id5"],
-        "person_id3": ["pub_id5", "pub_id6", "pub_id7"],
-    }
-
-    inverted_dict = invert_dict(dict)
-    assert len(inverted_dict.items()) == 7
-    assert sorted(inverted_dict.keys()) == [
-        "pub_id1",
-        "pub_id2",
-        "pub_id3",
-        "pub_id4",
-        "pub_id5",
-        "pub_id6",
-        "pub_id7",
-    ]
-    assert inverted_dict["pub_id2"] == ["person_id1", "person_id2"]
+    assert doi_orcids["10.1109/lra.2018.2890209"] == ["0000-0002-0770-2940"]



34 changes: 34 additions & 0 deletions test/harvest/test_openalex.py
@@ -0,0 +1,34 @@
import pickle
import re

from rialto_airflow.harvest import openalex


def test_dois_from_orcid():
    dois = list(openalex.dois_from_orcid('0000-0002-1298-3089'))
    assert len(dois) >= 54

def test_works_from_author_id():
    # the openalex api returns at most 200 works per page, so ensure that paging is working
    # for Akihisa Inoue, who has a lot of publications (> 4,000)
    works = list(openalex.works_from_author_id('a5008412118', limit=600))
    assert len(works) == 600, 'paging is limited to 600 works'
    assert len(set([work['id'] for work in works])) == 600, 'the works are unique'

def test_doi_orcid_pickle(tmp_path):
    pickle_file = tmp_path / "openalex-doi-orcid.pickle"
    openalex.doi_orcid_pickle('test/data/authors.csv', pickle_file)
    assert pickle_file.is_file(), "created the pickle file"

    mapping = pickle.load(pickle_file.open('rb'))
    assert isinstance(mapping, dict)
    assert len(mapping) > 0

    doi = list(mapping.keys())[0]
    assert '/' in doi

    orcids = mapping[doi]
    assert isinstance(orcids, list)
    assert len(orcids) > 0
    assert re.match(r'^\d+-\d+-\d+-\d+$', orcids[0])
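
These tests exercise the live OpenAlex API, so they sleep between requests and depend on data that can change. A sketch of testing the paging logic against stubbed responses instead (uses unittest.mock; an editor's illustration, not part of the commit, with a made-up author ID):

    from unittest.mock import MagicMock, patch

    from rialto_airflow.harvest import openalex

    def test_works_from_author_id_stubbed():
        page1 = MagicMock(status_code=200)
        page1.json.return_value = {"results": [{"id": "W1"}, {"id": "W2"}]}
        empty = MagicMock(status_code=200)
        empty.json.return_value = {"results": []}
        # one page of results followed by an empty page ends the paging loop
        with patch("rialto_airflow.harvest.openalex.requests.get", side_effect=[page1, empty]), \
             patch("rialto_airflow.harvest.openalex.time.sleep"):
            works = list(openalex.works_from_author_id("a123"))
        assert [w["id"] for w in works] == ["W1", "W2"]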
25 changes: 24 additions & 1 deletion test/test_utils.py
@@ -1,7 +1,9 @@
 import csv
 from pathlib import Path

 import pytest
-from rialto_airflow.utils import create_snapshot_dir, rialto_authors_orcids
+
+from rialto_airflow.utils import create_snapshot_dir, invert_dict, rialto_authors_orcids


 @pytest.fixture
@@ -25,3 +27,24 @@ def test_rialto_authors_orcids(tmp_path, authors_csv):
     orcids = rialto_authors_orcids(authors_csv)
     assert len(orcids) == 2
     assert "https://orcid.org/0000-0000-0000-0001" in orcids
+
+
+def test_invert_dict():
+    dict = {
+        "person_id1": ["pub_id1", "pub_id2", "pub_id3"],
+        "person_id2": ["pub_id2", "pub_id4", "pub_id5"],
+        "person_id3": ["pub_id5", "pub_id6", "pub_id7"],
+    }
+
+    inverted_dict = invert_dict(dict)
+    assert len(inverted_dict.items()) == 7
+    assert sorted(inverted_dict.keys()) == [
+        "pub_id1",
+        "pub_id2",
+        "pub_id3",
+        "pub_id4",
+        "pub_id5",
+        "pub_id6",
+        "pub_id7",
+    ]
+    assert inverted_dict["pub_id2"] == ["person_id1", "person_id2"]
