-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The code was modified to sleep between requests, to hopefully stay within the 100k daily limit. It was also adjusted to page through the works responses.
- Loading branch information
Showing
6 changed files
with
166 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import csv | ||
import logging | ||
import pickle | ||
import time | ||
|
||
import requests | ||
|
||
from rialto_airflow.utils import invert_dict | ||
|
||
|
||
def doi_orcid_pickle(authors_csv, pickle_file):
    """
    Pass in the Authors CSV and generate a DOI -> ORCID mapping as a pickle file.
    """
    # Accumulate ORCID -> [DOIs] for every author row, then invert the
    # mapping to DOI -> [ORCIDs] before pickling.
    orcid_dois = {}
    with open(authors_csv, "r") as csv_input:
        reader = csv.DictReader(csv_input)
        for row in reader:
            orcid = row["orcidid"].replace("https://orcid.org/", "")
            orcid_dois[orcid] = list(dois_from_orcid(orcid))

    with open(pickle_file, "wb") as handle:
        pickle.dump(invert_dict(orcid_dois), handle, protocol=pickle.HIGHEST_PROTOCOL)
|
||
def dois_from_orcid(orcid: str):
    """
    Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.

    Looks up the OpenAlex author record for the ORCID and then pages through
    that author's works, yielding each DOI string. Works without a DOI are
    skipped. A failed author lookup is logged and yields nothing.
    """
    # TODO: get a key so we don't have to sleep!
    time.sleep(1)

    orcid = f'https://orcid.org/{orcid}'
    author_resp = requests.get(f"https://api.openalex.org/authors/{orcid}", allow_redirects=True)
    if author_resp.status_code == 200:
        # the OpenAlex author id comes back as a full URL; strip it to the bare id
        author_id = author_resp.json()['id'].replace('https://openalex.org/', '')
        for pub in works_from_author_id(author_id):
            # not all publications have DOIs
            doi = pub.get('doi')
            if doi:
                yield doi
    else:
        # consistent with works_from_author_id: don't fail silently on a bad lookup
        logging.error("encountered non-200 response (%s) looking up ORCID %s", author_resp.status_code, orcid)
|
||
def works_from_author_id(author_id, limit=None):
    """
    Pass in the OpenAlex Author ID and get back an iterator of works.

    Pages through the OpenAlex /works endpoint 200 records at a time. An
    optional limit stops iteration after that many works have been yielded.
    A non-200 response is logged and ends the iteration.
    """
    url = "https://api.openalex.org/works"
    params = {
        "filter": f"author.id:{author_id}",
        "per_page": 200
    }

    count = 0
    page = 0
    has_more = True
    while has_more:
        page += 1
        params['page'] = page
        resp = requests.get(url, params)

        if resp.status_code == 200:
            # TODO: get a key so we don't have to sleep!
            time.sleep(1)
            # guard against a missing "results" key as well as an empty page
            results = resp.json().get("results")
            if not results:
                has_more = False
            else:
                for result in results:
                    count += 1
                    if limit is not None and count > limit:
                        # hit the limit: stop paging and stop scanning the
                        # remainder of the current page
                        has_more = False
                        break
                    yield result
        else:
            # lazy %-style args so the message is only built when logged
            logging.error("encountered non-200 response (%s): %s %s", resp.status_code, url, params)
            has_more = False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import pickle | ||
import re | ||
|
||
from rialto_airflow.harvest import openalex | ||
|
||
|
||
def test_dois_from_orcid():
    # this ORCID is known to have at least 54 works with DOIs in OpenAlex
    orcid = '0000-0002-1298-3089'
    dois = list(openalex.dois_from_orcid(orcid))
    assert len(dois) >= 54
|
||
def test_works_from_author_id():
    # the openalex api returns up to 200 works per page, so ensure that paging is working
    # for Akihisa Inoue who has a lot of publications (> 4,000)
    works = list(openalex.works_from_author_id('a5008412118', limit=600))
    assert len(works) == 600, 'paging is limiting to 600 works'
    assert len(set([work['id'] for work in works])) == 600, 'the works are unique'
|
||
def test_doi_orcid_pickle(tmp_path):
    """
    doi_orcid_pickle should write a pickle mapping DOI -> list of ORCID ids.
    """
    pickle_file = tmp_path / "openalex-doi-orcid.pickle"
    openalex.doi_orcid_pickle('test/data/authors.csv', pickle_file)
    assert pickle_file.is_file(), "created the pickle file"

    mapping = pickle.load(pickle_file.open('rb'))
    assert isinstance(mapping, dict)
    assert len(mapping) > 0

    # keys are DOIs (which always contain a slash)
    doi = list(mapping.keys())[0]
    assert '/' in doi

    # values are lists of bare ORCID ids (no https://orcid.org/ prefix)
    orcids = mapping[doi]
    assert isinstance(orcids, list)
    assert len(orcids) > 0
    assert re.match(r'^\d+-\d+-\d+-\d+$', orcids[0])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters