Extract DOI from identifier

The sul_pub harvesting code was expecting the 'doi' to be set as a key on each publication, but it's stored in a list off the 'identifier' property. It needs to be extracted from there in order to get persisted to the CSV.
sul-dlss · Jun 20, 2024 · 225f002 · 225f002
1 parent e614313
commit 225f002
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 1 deletion.
diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py
@@ -73,4 +73,16 @@ def harvest(host, key, since, limit):
                 more = False
                 break
 
-            yield {key: record[key] for key in record if key in sul_pub_fields}
+            pub = {key: record[key] for key in record if key in sul_pub_fields}
+            pub['doi'] = extract_doi(record)
+
+            yield pub
+
+
+def extract_doi(record):
+    """Return the DOI stored in a publication record's identifier list.
+
+    Scans the entries under the record's 'identifier' key and returns the
+    'id' of the first entry whose 'type' is 'doi'.
+
+    Args:
+        record: a publication dict; may carry an 'identifier' list of dicts.
+
+    Returns:
+        The 'id' value of the first 'doi' entry, or None when the record has
+        no 'identifier' list or no 'doi' entry.
+    """
+    # `or []` covers both a missing key and an explicit None value, either of
+    # which would otherwise raise TypeError when iterated.
+    for identifier in record.get('identifier') or []:
+        # .get() so an entry lacking 'type' or 'id' is tolerated, not fatal
+        if identifier.get('type') == 'doi':
+            return identifier.get('id')
+    return None
+
+
diff --git a/test/harvest/test_sul_pub.py b/test/harvest/test_sul_pub.py
@@ -23,3 +23,8 @@ def test_sul_pub_csv(tmpdir):
     df = pandas.read_csv(csv_file)
     assert len(df) == 2000
     assert "title" in df.columns
+
+    # there should be some dois in here
+    dois = df.doi[df.doi.notna()]
+    assert len(dois) > 1, 'there should be at least a few DOIs?'
+    assert not dois.iloc[0].startswith('http://'), 'DOI IDs not URLs'