Extract DOI from identifier

The sul_pub harvesting code expected 'doi' to be a top-level key on each publication, but the DOI is actually stored in the list under the 'identifier' property. It needs to be extracted from there so that it is persisted to the CSV.
sul-dlss · Jun 20, 2024 · 7b1c248 · 7b1c248
1 parent e614313
commit 7b1c248
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 1 deletion.
diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py
@@ -73,4 +73,14 @@ def harvest(host, key, since, limit):
                 more = False
                 break
 
-            yield {key: record[key] for key in record if key in sul_pub_fields}
+            pub = {key: record[key] for key in record if key in sul_pub_fields}
+            pub["doi"] = extract_doi(record)
+
+            yield pub
+
+
def extract_doi(record):
    """Return the DOI found in a publication record's identifiers.

    The DOI is not a top-level key of the record; it is one entry in the
    list stored under the ``"identifier"`` key, where each entry is a
    mapping with ``"type"`` and ``"id"`` keys.

    Args:
        record: A publication mapping, optionally carrying an
            ``"identifier"`` list.

    Returns:
        The ``"id"`` of the first identifier whose ``"type"`` is ``"doi"``
        and which has a non-empty ``"id"``, or ``None`` when the record has
        no such identifier.
    """
    # `or []` covers both a missing "identifier" key and an explicit null,
    # either of which would otherwise make the loop raise TypeError.
    for identifier in record.get("identifier") or []:
        # Use .get() so an entry lacking "type" or "id" is skipped rather
        # than aborting the whole harvest with a KeyError.
        if identifier.get("type") == "doi" and identifier.get("id"):
            return identifier["id"]
    return None
diff --git a/test/harvest/test_sul_pub.py b/test/harvest/test_sul_pub.py
@@ -23,3 +23,8 @@ def test_sul_pub_csv(tmpdir):
     df = pandas.read_csv(csv_file)
     assert len(df) == 2000
     assert "title" in df.columns
+
+    # there should be some dois in here
+    dois = df.doi[df.doi.notna()]
+    assert len(dois) > 1, "there should be at least a few DOIs?"
+    assert not dois.iloc[0].startswith("http://"), "DOI IDs not URLs"