def extract_doi(record):
    """Return the DOI listed among a publication record's identifiers.

    The DOI is not a top-level key on the record; it is one entry in the
    list stored under the record's 'identifier' key, where each entry is a
    mapping with a 'type' and an 'id'.

    Args:
        record: a publication mapping. Its 'identifier' value, when
            present, is expected to be a list of mappings.

    Returns:
        The 'id' of the first identifier whose 'type' is 'doi', or None when
        the record has no 'identifier' value or no usable DOI entry.
    """
    # 'identifier' may be absent or explicitly null; treat both as "no
    # identifiers" instead of failing on iteration over None.
    for identifier in record.get('identifier') or ():
        # Skip entries missing 'type' or 'id' rather than raising KeyError,
        # so one malformed identifier does not abort the caller's harvest.
        if identifier.get('type') == 'doi' and 'id' in identifier:
            return identifier['id']
    return None