-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 4 changed files with 87 additions and 5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import csv | ||
import pickle | ||
|
||
|
||
def create_doi_set(dimensions: str, openalex: str, sul_pub_csv: str) -> list:
    """Collect DOIs from all three sources and return them without duplicates.

    Args:
        dimensions: Path to the Dimensions pickle file.
        openalex: Path to the OpenAlex pickle file.
        sul_pub_csv: Path to the sul_pub CSV file.

    Returns:
        A list containing each distinct DOI once. The list is built from a
        set, so its ordering is not defined.
    """
    from_dimensions = dois_from_pickle(dimensions)
    from_openalex = dois_from_pickle(openalex)
    from_sul_pub = get_sul_pub_dois(sul_pub_csv)

    # Pouring all three lists into one set is what removes the duplicates.
    distinct = {*from_dimensions, *from_openalex, *from_sul_pub}
    return list(distinct)
|
||
|
||
def dois_from_pickle(pickle_file: str) -> list:
    """Load a pickled dictionary keyed by DOI and return its DOIs.

    Args:
        pickle_file: Path to a pickle file containing a dictionary whose keys
            are DOIs (the values, presumably ORCID lists, are ignored here).

    Returns:
        The dictionary's keys as a list, in the dictionary's own order.
    """
    # NOTE(security): pickle.load can run arbitrary code while deserializing,
    # so this must only be pointed at files written by trusted harvest steps.
    with open(pickle_file, "rb") as handle:
        data = pickle.load(handle)

    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return list(data)
|
||
|
||
def get_sul_pub_dois(sul_pub_csv: str) -> list:
    """Extract the non-empty values of the ``doi`` column from a sul_pub CSV.

    Args:
        sul_pub_csv: Path to a CSV file whose header row includes ``doi``.

    Returns:
        The ``doi`` value of every row where it is non-empty, in file order.
        Duplicates are kept.

    Raises:
        KeyError: If the file has data rows but no ``doi`` column.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; otherwise universal-newline translation can
    # rewrite line breaks embedded inside quoted fields.
    with open(sul_pub_csv, "r", newline="") as file:
        reader = csv.DictReader(file)
        # Rows with a blank (or missing trailing) doi cell are falsy and dropped.
        return [row["doi"] for row in reader if row["doi"]]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import csv | ||
import pickle | ||
|
||
import pytest | ||
|
||
from rialto_airflow.harvest.doi_set import create_doi_set | ||
|
||
|
||
@pytest.fixture
def dimensions_pickle(tmp_path):
    """Write a two-entry DOI-to-ORCID mapping to ``dimensions.pickle``.

    Returns the path of the pickle file created under ``tmp_path``.
    """
    pickle_file = tmp_path / "dimensions.pickle"
    doi_to_orcids = {
        "10.0000/1234": ["https://orcid.org/0000-0000-0000-0001"],
        "10.0000/cccc": ["https://orcid.org/0000-0000-0000-0002"],
    }
    with pickle_file.open("wb") as out:
        pickle.dump(doi_to_orcids, out, protocol=pickle.HIGHEST_PROTOCOL)

    return pickle_file
|
||
|
||
@pytest.fixture
def openalex_pickle(tmp_path):
    """Write a two-entry DOI-to-ORCID mapping to ``openalex.pickle``.

    Returns the path of the pickle file created under ``tmp_path``.
    """
    pickle_file = tmp_path / "openalex.pickle"
    doi_to_orcids = {
        "10.0000/cccc": ["https://orcid.org/0000-0000-0000-0001"],
        "10.0000/zzzz": ["https://orcid.org/0000-0000-0000-0002"],
    }
    with pickle_file.open("wb") as out:
        pickle.dump(doi_to_orcids, out, protocol=pickle.HIGHEST_PROTOCOL)

    return pickle_file
|
||
|
||
@pytest.fixture
def sul_pub_csv(tmp_path):
    """Write a small sul_pub-style CSV under ``tmp_path`` and return its path.

    The file has a ``sunetid,title,doi`` header, two rows with DOIs, and one
    row whose ``doi`` cell is empty so that empty-value filtering is exercised.
    """
    fixture_file = tmp_path / "sul_pub.csv"
    # newline="" is the csv module's documented way to open files for writing.
    with open(fixture_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(
            [
                ["sunetid", "title", "doi"],
                ["author1", "A Publication", "10.0000/aaaa"],
                ["author2", "A Research Article", "10.0000/1234"],
                # No DOI: this row must not contribute to the DOI set.
                ["author3", "A Publication Without a DOI", ""],
            ]
        )
    return fixture_file
|
||
|
||
def test_doi_set(dimensions_pickle, openalex_pickle, sul_pub_csv):
    """Check that create_doi_set returns exactly the four distinct DOIs."""
    expected = {"10.0000/1234", "10.0000/aaaa", "10.0000/cccc", "10.0000/zzzz"}

    dois = create_doi_set(dimensions_pickle, openalex_pickle, sul_pub_csv)

    # The length check catches duplicates that the set comparison would hide.
    assert len(dois) == 4
    assert set(dois) == expected