Skip to content

Commit

Permalink
Ran ruff format
Browse files Browse the repository at this point in the history
  • Loading branch information
edsu committed Jun 17, 2024
1 parent e08692f commit 497e2b1
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 12 deletions.
7 changes: 4 additions & 3 deletions rialto_airflow/dags/update_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@
sul_pub_host = Variable.get("sul_pub_host")
sul_pub_key = Variable.get("sul_pub_key")


@dag(
schedule=None,
start_date=datetime.datetime(2024, 1, 1),
catchup=False,
)
def update_data():

# multiple_outputs=True asks Airflow to expose each key of the returned dict
# as its own output, so downstream tasks can take them individually.
@task(multiple_outputs=True)
def setup():
    """
    Setup the data directory to write to and determine the last harvest.

    Returns a dict with two keys:
      - "last_harvest": the value returned by last_harvest()
      - "snapshot_dir": the path returned by create_snapshot_dir(data_dir)
    """
    return {
        "last_harvest": last_harvest(),
        "snapshot_dir": create_snapshot_dir(data_dir),
    }

@task()
Expand All @@ -42,7 +42,7 @@ def fetch_sul_pub(last_harvest, snapshot_dir):
def extract_doi(sulpub):
"""
Extract a unique list of DOIs from the new publications data.
"""
"""
return True

@task()
Expand Down Expand Up @@ -97,4 +97,5 @@ def publish(dataset):
dataset = create_dataset(pubs, contribs)
publish(dataset)


update_data()
8 changes: 4 additions & 4 deletions rialto_airflow/harvest/sul_pub.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,23 +46,23 @@ def harvest(host, key, since, limit):

http_headers = {"CAPKEY": key}

params = { "per": 1000 }
params = {"per": 1000}
if since:
params["changedSince"] = since.strftime('%Y-%m-%d')
params["changedSince"] = since.strftime("%Y-%m-%d")

page = 0
record_count = 0
more = True

while more:
page += 1
params['page'] = page
params["page"] = page

logging.info(f"fetching sul_pub results {url} {params}")
resp = requests.get(url, params=params, headers=http_headers)
resp.raise_for_status()

records = resp.json()['records']
records = resp.json()["records"]
if len(records) == 0:
more = False

Expand Down
4 changes: 3 additions & 1 deletion rialto_airflow/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import os
import datetime


def last_harvest():
    """
    Return the time of the last harvest as a timezone-aware datetime.

    This is currently a fixed placeholder (2024-01-01 00:00:00 UTC); it does
    not yet inspect any stored data.
    """
    # TODO: look in the data_dir to determine the last harvest
    return datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)


def create_snapshot_dir(data_dir):
    """
    Create a new snapshot directory inside data_dir and return its path.

    The directory is named after the current time formatted as
    YYYYmmddHHMMSS (e.g. "20240617153000").

    Raises FileNotFoundError if data_dir does not exist, and FileExistsError
    if a snapshot directory for the same second has already been created.
    """
    # NOTE(review): now() is naive local time, whereas last_harvest() returns
    # an aware UTC datetime -- confirm that local time is intended here.
    now = datetime.datetime.now()
    snapshot_dir = os.path.join(data_dir, now.strftime("%Y%m%d%H%M%S"))

    # os.mkdir (not makedirs) so that an existing snapshot is never reused.
    os.mkdir(snapshot_dir)

    return snapshot_dir
10 changes: 6 additions & 4 deletions test/harvest/test_sul_pub.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@

dotenv.load_dotenv()

sul_pub_host = os.environ.get('AIRFLOW_VAR_SUL_PUB_HOST')
sul_pub_key = os.environ.get('AIRFLOW_VAR_SUL_PUB_KEY')
sul_pub_host = os.environ.get("AIRFLOW_VAR_SUL_PUB_HOST")
sul_pub_key = os.environ.get("AIRFLOW_VAR_SUL_PUB_KEY")

no_auth = not (sul_pub_host and sul_pub_key)


@pytest.mark.skipif(no_auth, reason="no sul_pub key")
def test_sul_pub_csv(tmpdir):
csv_file = tmpdir / "sul_pub.csv"
Expand All @@ -24,12 +25,13 @@ def test_sul_pub_csv(tmpdir):
assert len(df) == 2000
assert "title" in df.columns


@pytest.mark.skip(reason="sul_pub changeSince broken")
@pytest.mark.skipif(no_auth, reason="no sul_pub key")
def test_sul_pub_csv_since(tmpdir):
    """
    Harvest with a `since` cutoff and check that no row in the resulting CSV
    has a last_updated value earlier than that cutoff.
    """
    csv_file = tmpdir / "sul_pub.csv"
    since = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
    sul_pub_csv(csv_file, sul_pub_host, sul_pub_key, since=since, limit=100)

    # Parse last_updated as dates so it can be compared against `since`.
    df = pandas.read_csv(csv_file, parse_dates=["last_updated"])
    assert len(df[df["last_updated"] < since]) == 0

0 comments on commit 497e2b1

Please sign in to comment.