From 225f0025225df1e44f0c89b449ac8c7bfe49dc45 Mon Sep 17 00:00:00 2001
From: Ed Summers <ehs@pobox.com>
Date: Thu, 20 Jun 2024 16:44:16 -0400
Subject: [PATCH] Extract DOI from identifier

The sul_pub harvesting code was expecting 'doi' to be set as a key on
each publication, but the DOI is actually stored as an entry in the list
under the 'identifier' property. It needs to be extracted from there in
order to get persisted to the CSV.
---
 rialto_airflow/harvest/sul_pub.py | 14 +++++++++++++-
 test/harvest/test_sul_pub.py      |  5 +++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py
index 1bb22b0..ad5d5af 100644
--- a/rialto_airflow/harvest/sul_pub.py
+++ b/rialto_airflow/harvest/sul_pub.py
@@ -73,4 +73,16 @@ def harvest(host, key, since, limit):
                 more = False
                 break
 
-            yield {key: record[key] for key in record if key in sul_pub_fields}
+            pub = {key: record[key] for key in record if key in sul_pub_fields}
+            pub['doi'] = extract_doi(record)
+
+            yield pub
+
+
+def extract_doi(record):
+    """Return the first DOI in the record's 'identifier' list, or None."""
+    # 'identifier' may be missing or null, so fall back to an empty list
+    for identifier in record.get('identifier') or []:
+        if identifier.get('type') == 'doi' and identifier.get('id'):
+            return identifier['id']
+    return None
diff --git a/test/harvest/test_sul_pub.py b/test/harvest/test_sul_pub.py
index b022d3b..6e9f29b 100644
--- a/test/harvest/test_sul_pub.py
+++ b/test/harvest/test_sul_pub.py
@@ -23,3 +23,8 @@ def test_sul_pub_csv(tmpdir):
     df = pandas.read_csv(csv_file)
     assert len(df) == 2000
     assert "title" in df.columns
+
+    # there should be some dois in here
+    dois = df.doi[df.doi.notna()]
+    assert len(dois) > 1, 'there should be at least a few DOIs?'
+    assert not dois.iloc[0].startswith('http://'), 'DOI IDs not URLs'