Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create list of unique DOIs #44

Merged
merged 1 commit into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions rialto_airflow/dags/harvest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
from airflow.decorators import dag, task

from rialto_airflow.utils import create_snapshot_dir, rialto_authors_file
from rialto_airflow.harvest.sul_pub import sul_pub_csv
from rialto_airflow.harvest import dimensions, openalex
from rialto_airflow.harvest.sul_pub import sul_pub_csv
from rialto_airflow.harvest.doi_set import create_doi_set


data_dir = Variable.get("data_dir")
sul_pub_host = Variable.get("sul_pub_host")
Expand Down Expand Up @@ -73,7 +75,7 @@ def doi_set(dimensions, openalex, sul_pub):
Extract a unique list of DOIs from the dimensions doi-orcid dict,
the openalex doi-orcid dict, and the SUL-Pub publications.
"""
return True
return create_doi_set(dimensions, openalex, sul_pub)

@task()
def dimensions_harvest_doi(dois):
Expand Down Expand Up @@ -127,7 +129,7 @@ def publish(dataset):

openalex_orcid = openalex_harvest_orcid(authors_csv, snapshot_dir)

dois = doi_set(sul_pub, dimensions_orcid, openalex_orcid)
dois = doi_set(dimensions_orcid, openalex_orcid, sul_pub)

dimensions_doi = dimensions_harvest_doi(dois)

Expand Down
3 changes: 1 addition & 2 deletions rialto_airflow/harvest/dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ def dois_from_orcid(orcid):
if len(result["publications"]) == 1000:
logging.warning("Truncated results for ORCID %s", orcid)
for pub in result["publications"]:
doi = pub.get("doi")
if doi:
if pub.get("doi"):
yield pub["doi"]


Expand Down
30 changes: 30 additions & 0 deletions rialto_airflow/harvest/doi_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import csv
import pickle


def create_doi_set(dimensions: str, openalex: str, sul_pub_csv: str) -> list:
    """Return the deduplicated union of DOIs from all three harvest sources.

    Args:
        dimensions: path to a pickled Dimensions DOI->ORCID dict.
        openalex: path to a pickled OpenAlex DOI->ORCID dict.
        sul_pub_csv: path to the SUL-Pub publications CSV.

    Returns:
        A list of unique DOI strings (order unspecified).
    """
    # Set union handles deduplication across the three sources.
    combined = (
        set(dois_from_pickle(dimensions))
        | set(dois_from_pickle(openalex))
        | set(get_sul_pub_dois(sul_pub_csv))
    )
    return list(combined)


def dois_from_pickle(pickle_file: str) -> list:
    """Load a pickled DOI->ORCID dictionary and return its DOIs.

    Args:
        pickle_file: path to a pickle file containing a dict keyed by DOI.

    Returns:
        The DOI keys as a list of strings.

    NOTE: pickle.load executes arbitrary code from the file; this is only
    safe because the pickles are produced by our own harvest steps.
    """
    # Fix: the original annotated the return type as `dict`, but the
    # function has always returned a list of the dict's keys.
    with open(pickle_file, "rb") as handle:
        data = pickle.load(handle)

    return list(data.keys())


def get_sul_pub_dois(sul_pub_csv: str) -> list:
    """Read the sul_pub CSV and return its non-empty DOI values.

    Args:
        sul_pub_csv: path to a CSV with a "doi" column.

    Returns:
        A list of DOI strings, skipping rows with an empty value.
    """
    dois = []
    with open(sul_pub_csv, "r") as infile:
        for row in csv.DictReader(infile):
            doi = row["doi"]
            if doi:
                dois.append(doi)
    return dois
51 changes: 51 additions & 0 deletions test/harvest/test_doi_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import csv
import pickle

import pytest

from rialto_airflow.harvest.doi_set import create_doi_set


@pytest.fixture
def dimensions_pickle(tmp_path):
    """Write a pickled Dimensions DOI->ORCID mapping and return its path."""
    doi_orcid = {
        "10.0000/1234": ["https://orcid.org/0000-0000-0000-0001"],
        "10.0000/cccc": ["https://orcid.org/0000-0000-0000-0002"],
    }
    path = tmp_path / "dimensions.pickle"
    with open(path, "wb") as outfile:
        pickle.dump(doi_orcid, outfile, protocol=pickle.HIGHEST_PROTOCOL)
    return path


@pytest.fixture
def openalex_pickle(tmp_path):
    """Write a pickled OpenAlex DOI->ORCID mapping and return its path."""
    doi_orcid = {
        "10.0000/cccc": ["https://orcid.org/0000-0000-0000-0001"],
        "10.0000/zzzz": ["https://orcid.org/0000-0000-0000-0002"],
    }
    path = tmp_path / "openalex.pickle"
    with open(path, "wb") as outfile:
        pickle.dump(doi_orcid, outfile, protocol=pickle.HIGHEST_PROTOCOL)
    return path


@pytest.fixture
def sul_pub_csv(tmp_path):
    """Write a small sul_pub CSV fixture and return its path."""
    path = tmp_path / "sul_pub.csv"
    rows = [
        ["sunetid", "title", "doi"],
        ["author1", "A Publication", "10.0000/aaaa"],
        ["author2", "A Research Article", "10.0000/1234"],
    ]
    with open(path, "w", newline="") as csvfile:
        csv.writer(csvfile).writerows(rows)
    return path


def test_doi_set(dimensions_pickle, openalex_pickle, sul_pub_csv):
    """create_doi_set merges the three sources and removes duplicate DOIs."""
    dois = create_doi_set(dimensions_pickle, openalex_pickle, sul_pub_csv)
    expected = {"10.0000/1234", "10.0000/aaaa", "10.0000/cccc", "10.0000/zzzz"}
    assert len(dois) == 4
    assert set(dois) == expected
Loading