Skip to content

Commit

Permalink
Handle unexpected value in sulpub year
Browse files Browse the repository at this point in the history
  • Loading branch information
lwrubel committed Jul 17, 2024
1 parent 45deb75 commit 5945113
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 8 deletions.
2 changes: 1 addition & 1 deletion rialto_airflow/harvest/merge_pubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def sulpub_df(sul_pub):
"""
Create a sulpub LazyFrame and rename columns
"""
df = pl.scan_csv(sul_pub)
df = pl.scan_csv(sul_pub, null_values="n/ a")
df = df.drop_nulls("doi")
df = df.with_columns(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String)
Expand Down
26 changes: 19 additions & 7 deletions test/harvest/test_merge_pubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,22 +117,30 @@ def sul_pubs_csv(tmp_path):
writer = csv.writer(csvfile)
header = ["authorship", "title", "year", "doi"]
writer.writerow(header)
writer.writerow(["[]", "A Publication", "2024", "10.0000/cccc"])
writer.writerow(["[]", "A Publication", 2024, "10.0000/cccc"])
writer.writerow(
[
"[]",
"A Research Article",
"2024",
2024,
]
)
writer.writerow(
[
"[]",
"A Published Research Article",
"2024",
2024,
"doi: 10.0000/dDdD",
]
)
writer.writerow(
[
"[]",
"A Published Research Article",
"n/ a",
"doi: 10.0000/eeee",
]
)
return fixture_file


Expand All @@ -158,23 +166,27 @@ def test_sulpub_df(sul_pubs_csv):
lazy_df = merge_pubs.sulpub_df(sul_pubs_csv)
assert isinstance(lazy_df, pl.lazyframe.frame.LazyFrame)
df = lazy_df.collect()
assert df.shape[0] == 2, "Row without a doi has been dropped"
assert df.shape[0] == 3, "Row without a doi has been dropped"
assert df.columns == [
"sul_pub_authorship",
"sul_pub_title",
"sul_pub_year",
"sul_pub_doi",
]
assert df["sul_pub_doi"].to_list() == ["10.0000/cccc", "10.0000/dddd"]
assert df["sul_pub_doi"].to_list() == [
"10.0000/cccc",
"10.0000/dddd",
"10.0000/eeee",
]


def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv):
    """
    Merge the three fixture CSVs and check the resulting parquet file.

    The sul_pub fixture contributes three rows with a DOI (including the
    one whose year is the unexpected value "n/ a"), so the merged output
    is expected to hold five distinct DOIs.
    """
    output = tmp_path / "merged_pubs.parquet"
    merge_pubs.merge(sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv, output)
    assert output.is_file(), "output file has been created"
    df = pl.read_parquet(output)
    # One row per distinct DOI across the three sources.
    assert df.shape[0] == 5
    assert df.shape[1] == 25
    assert set(df["doi"].to_list()) == {
        "10.0000/aaaa",
        "10.0000/1234",
        "10.0000/cccc",
        "10.0000/dddd",
        "10.0000/eeee",
    }

0 comments on commit 5945113

Please sign in to comment.