Skip to content

Commit

Permalink
Only harvest approved publications from sul_pub
Browse files Browse the repository at this point in the history
We are currently saving *all* publications from sul_pub in our
`sul_pub.csv`, but we only want to include publications that have been
*approved* by at least one Stanford author. Including unapproved
publications caused publications.parquet to contain more publications
than there were contributions in contributions.parquet.

Fixes #115
  • Loading branch information
edsu committed Feb 11, 2025
1 parent cea7324 commit 1f13771
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 0 deletions.
13 changes: 13 additions & 0 deletions rialto_airflow/harvest/sul_pub.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ def harvest(host, key, since, limit):
more = False

for record in records:
if not approved(record):
continue

record_count += 1
if limit is not None and record_count > limit:
logging.info(f"stopping with limit={limit}")
Expand All @@ -71,3 +74,13 @@ def extract_doi(record):
if id.get("type") == "doi" and "id" in id:
return id["id"].replace("https://doi.org/", "")
return None


def approved(pub):
    """
    Returns True if at least one author has approved the publication, and False if not.

    A publication counts as approved when any entry in its "authorship" list
    has a "status" equal to "approved". A missing, null or empty "authorship"
    value, and authorship entries that lack a "status" key, are treated as
    not approved instead of raising KeyError/TypeError, so that one malformed
    record cannot abort the caller's loop over records.
    """
    # `or []` covers both an absent key and an explicit null value.
    authorships = pub.get("authorship") or []
    return any(
        authorship.get("status") == "approved" for authorship in authorships
    )
9 changes: 9 additions & 0 deletions test/harvest/test_sul_pub.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,12 @@ def test_sul_pub_csv(tmpdir):
dois = df.doi[df.doi.notna()]
assert len(dois) > 1, "there should be at least a few DOIs?"
assert not dois.iloc[0].startswith("http://"), "DOI IDs not URLs"

# all the publications should be approved by at least one author
for _, pub in df.iterrows():
approved = False
# the value in the authorship column is a serialized Python dictionary
for authorship in eval(pub["authorship"]):
if authorship["status"] == "approved":
approved = True
assert approved is True, f"sulpubid={pub['sulpubid']} is marked approved"

0 comments on commit 1f13771

Please sign in to comment.