Skip to content

Commit

Permalink
doi_set task
Browse files Browse the repository at this point in the history
  • Loading branch information
lwrubel committed Jun 21, 2024
1 parent 4a6bbb3 commit 034c15c
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 5 deletions.
8 changes: 5 additions & 3 deletions rialto_airflow/dags/harvest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
from airflow.decorators import dag, task

from rialto_airflow.utils import create_snapshot_dir, rialto_authors_file
from rialto_airflow.harvest.sul_pub import sul_pub_csv
from rialto_airflow.harvest import dimensions, openalex
from rialto_airflow.harvest.sul_pub import sul_pub_csv
from rialto_airflow.harvest.doi_set import create_doi_set


data_dir = Variable.get("data_dir")
sul_pub_host = Variable.get("sul_pub_host")
Expand Down Expand Up @@ -73,7 +75,7 @@ def doi_set(dimensions, openalex, sul_pub):
Extract a unique list of DOIs from the dimensions doi-orcid dict,
the openalex doi-orcid dict, and the SUL-Pub publications.
"""
return True
return create_doi_set(dimensions, openalex, sul_pub)

@task()
def dimensions_harvest_doi(dois):
Expand Down Expand Up @@ -127,7 +129,7 @@ def publish(dataset):

openalex_orcid = openalex_harvest_orcid(authors_csv, snapshot_dir)

dois = doi_set(sul_pub, dimensions_orcid, openalex_orcid)
dois = doi_set(dimensions_orcid, openalex_orcid, sul_pub)

dimensions_doi = dimensions_harvest_doi(dois)

Expand Down
3 changes: 1 addition & 2 deletions rialto_airflow/harvest/dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ def dois_from_orcid(orcid):
if len(result["publications"]) == 1000:
logging.warning("Truncated results for ORCID %s", orcid)
for pub in result["publications"]:
doi = pub.get("doi")
if doi:
if pub.get("doi"):
yield pub["doi"]


Expand Down
30 changes: 30 additions & 0 deletions rialto_airflow/harvest/doi_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import csv
import pickle


def create_doi_set(dimensions: str, openalex: str, sul_pub_csv: str) -> list:
    """Collect the DOIs found in all three sources, without duplicates.

    Args:
        dimensions: Path to the pickled DOI-keyed dictionary from Dimensions.
        openalex: Path to the pickled DOI-keyed dictionary from OpenAlex.
        sul_pub_csv: Path to the sul_pub CSV file with a "doi" column.

    Returns:
        A list holding each distinct DOI once. The order is whatever the
        intermediate set yields, so callers should not rely on it.
    """
    from_dimensions = dois_from_pickle(dimensions)
    from_openalex = dois_from_pickle(openalex)
    from_sul_pub = get_sul_pub_dois(sul_pub_csv)

    # Unpacking into a set literal drops DOIs that occur in more than one source.
    return list({*from_dimensions, *from_openalex, *from_sul_pub})


def dois_from_pickle(pickle_file: str) -> list:
    """Load a pickled dictionary keyed by DOI and return its DOIs.

    Args:
        pickle_file: Path to a pickle file containing a dictionary whose keys
            are DOIs (the values, e.g. ORCIDs, are ignored here).

    Returns:
        The dictionary's keys as a list, in the dictionary's iteration order.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only point this at pickle files written by a trusted harvest step.
    with open(pickle_file, "rb") as handle:
        data = pickle.load(handle)

    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return list(data)


def get_sul_pub_dois(sul_pub_csv: str) -> list:
    """Extract the non-empty DOIs from the "doi" column of a sul_pub CSV.

    Args:
        sul_pub_csv: Path to a CSV file whose header row includes a "doi" column.

    Returns:
        The DOI values in file order, skipping rows whose DOI is empty or
        missing. Duplicates are kept.

    Raises:
        KeyError: If a data row is read from a CSV whose header has no "doi"
            column.
    """
    # The csv module does its own newline handling; opening with newline=""
    # (as its documentation requires) keeps line endings embedded in quoted
    # fields from being rewritten by universal-newline translation.
    with open(sul_pub_csv, newline="") as file:
        reader = csv.DictReader(file)
        return [row["doi"] for row in reader if row["doi"]]
51 changes: 51 additions & 0 deletions test/harvest/test_doi_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import csv
import pickle

import pytest

from rialto_airflow.harvest.doi_set import create_doi_set


@pytest.fixture
def dimensions_pickle(tmp_path):
    """Write a sample Dimensions DOI-to-ORCID pickle and return its path."""
    doi_orcids = {
        "10.0000/1234": ["https://orcid.org/0000-0000-0000-0001"],
        "10.0000/cccc": ["https://orcid.org/0000-0000-0000-0002"],
    }
    target = tmp_path / "dimensions.pickle"
    # Serialize in memory, then write the bytes out in a single call.
    target.write_bytes(pickle.dumps(doi_orcids, protocol=pickle.HIGHEST_PROTOCOL))
    return target


@pytest.fixture
def openalex_pickle(tmp_path):
    """Write a sample OpenAlex DOI-to-ORCID pickle and return its path."""
    doi_orcids = {
        "10.0000/cccc": ["https://orcid.org/0000-0000-0000-0001"],
        "10.0000/zzzz": ["https://orcid.org/0000-0000-0000-0002"],
    }
    target = tmp_path / "openalex.pickle"
    # Serialize in memory, then write the bytes out in a single call.
    target.write_bytes(pickle.dumps(doi_orcids, protocol=pickle.HIGHEST_PROTOCOL))
    return target


@pytest.fixture
def sul_pub_csv(tmp_path):
    """Write a sample sul_pub CSV (header plus two rows) and return its path."""
    rows = [
        ["sunetid", "title", "doi"],
        ["author1", "A Publication", "10.0000/aaaa"],
        ["author2", "A Research Article", "10.0000/1234"],
    ]
    csv_path = tmp_path / "sul_pub.csv"
    with csv_path.open("w", newline="") as out:
        csv.writer(out).writerows(rows)
    return csv_path


def test_doi_set(dimensions_pickle, openalex_pickle, sul_pub_csv):
    """DOIs shared between sources appear only once in the combined result."""
    expected = {"10.0000/1234", "10.0000/aaaa", "10.0000/cccc", "10.0000/zzzz"}

    result = create_doi_set(dimensions_pickle, openalex_pickle, sul_pub_csv)

    # Six DOIs go in across the three fixtures; two are repeats.
    assert len(result) == 4
    assert set(result) == expected

0 comments on commit 034c15c

Please sign in to comment.