Skip to content

Commit

Permalink
Make pub year columns strings
Browse files Browse the repository at this point in the history
  • Loading branch information
lwrubel committed Jul 18, 2024
1 parent bfad4e8 commit df8a828
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions rialto_airflow/harvest/merge_pubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def dimensions_pubs_df(dimensions_pubs):
# Create a LazyFrame of dimension pubs to avoid loading all data into memory
"""
# Polars infers volume as an integer, but it should be a string, e.g. "97-B"
df = pl.scan_csv(dimensions_pubs, schema_overrides={"volume": pl.String})
df = pl.scan_csv(dimensions_pubs, schema_overrides={"volume": pl.String, "year": pl.String})
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col(
Expand All @@ -71,7 +71,7 @@ def openalex_pubs_df(openalex_pubs):
"""
Create an openalex pubs LazyFrame and rename columns
"""
df = pl.scan_csv(openalex_pubs)
df = pl.scan_csv(openalex_pubs, schema_overrides={"publication_year": pl.String})
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col(
Expand All @@ -86,7 +86,7 @@ def sulpub_df(sul_pub):
"""
Create a sulpub LazyFrame and rename columns
"""
df = pl.scan_csv(sul_pub, null_values="n/ a")
df = pl.scan_csv(sul_pub, schema_overrides={"year": pl.String})
df = df.drop_nulls("doi")
df = df.with_columns(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String)
Expand Down

0 comments on commit df8a828

Please sign in to comment.