Skip to content

Commit

Permalink
pick up rialto orgs author ORCIDs
Browse files Browse the repository at this point in the history
  • Loading branch information
lwrubel committed Jun 19, 2024
1 parent 3376e2b commit da59bce
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 2 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ for i in `vault kv list -format yaml puppet/application/rialto-airflow/dev | sed
done
```

5. The harvest DAG requires a CSV file of authors from rialto-orgs. This file is not yet provided automatically, so for local development download it from
<https://sul-rialto-dev.stanford.edu/authors?action=index&commit=Search&controller=authors&format=csv&orcid_filter=&q=> and save it as `authors.csv` in the `data/` directory.

## Development

### Set-up
Expand All @@ -76,6 +79,7 @@ This will create the virtual environment at the default location of `.venv/`. `u
source .venv/bin/activate
```


### Install dependencies
```
uv pip install -r requirements.txt
Expand Down
129 changes: 129 additions & 0 deletions rialto_airflow/dags/example_dag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
### Tutorial Documentation
Documentation that goes along with the Airflow tutorial located
[here](https://airflow.apache.org/tutorial.html)
"""

from __future__ import annotations

# [START tutorial]
# [START import_module]
import textwrap
from datetime import datetime, timedelta

# The DAG object; we'll need this to instantiate a DAG
from airflow.models.dag import DAG

# Operators; we need this to operate!
from airflow.operators.bash import BashOperator

# [END import_module]


# [START instantiate_dag]
# Define the "tutorial" DAG: a "print_date" task followed by the "sleep" and
# "templated" tasks (see the dependency declaration at the bottom).
with DAG(
    "tutorial",
    # [START default_args]
    # These args will get passed on to each operator
    # You can override them on a per-task basis during operator initialization
    default_args={
        "depends_on_past": False,
        # Placeholder address; both e-mail notifications below are disabled.
        "email": ["airflow@example.com"],
        "email_on_failure": False,
        "email_on_retry": False,
        "retries": 1,
        "retry_delay": timedelta(minutes=5),
        # 'queue': 'bash_queue',
        # 'pool': 'backfill',
        # 'priority_weight': 10,
        # 'end_date': datetime(2016, 1, 1),
        # 'wait_for_downstream': False,
        # 'sla': timedelta(hours=2),
        # 'execution_timeout': timedelta(seconds=300),
        # 'on_failure_callback': some_function, # or list of functions
        # 'on_success_callback': some_other_function, # or list of functions
        # 'on_retry_callback': another_function, # or list of functions
        # 'sla_miss_callback': yet_another_function, # or list of functions
        # 'on_skipped_callback': another_function, #or list of functions
        # 'trigger_rule': 'all_success'
    },
    # [END default_args]
    description="A simple tutorial DAG",
    schedule=timedelta(days=1),
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=["example"],
) as dag:
    # [END instantiate_dag]
    # t1, t2 and t3 are examples of tasks created by instantiating operators
    # [START basic_task]

    t1 = BashOperator(
        task_id="print_date",
        bash_command="date",
    )

    # Overrides the default_args value of "retries" (1) for this task only.
    t2 = BashOperator(
        task_id="sleep",
        depends_on_past=False,
        bash_command="sleep 5",
        retries=3,
    )
    # [END basic_task]

    # [START documentation]
    t1.doc_md = textwrap.dedent(
        """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown),
`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets
rendered in the UI's Task Instance Details page.
![img](https://imgs.xkcd.com/comics/fixing_problems.png)
**Image Credit:** Randall Munroe, [XKCD](https://xkcd.com/license.html)
"""
    )

    # Two alternative ways of documenting the DAG are shown; the second
    # assignment replaces the first.
    dag.doc_md = (
        __doc__  # providing that you have a docstring at the beginning of the DAG; OR
    )
    dag.doc_md = """
This is a documentation placed anywhere
"""  # otherwise, type it like this
    # [END documentation]

    # [START jinja_template]
    # Jinja template for the bash command: loops five times, echoing the
    # "ds" template variable and the result of macros.ds_add(ds, 7).
    templated_command = textwrap.dedent(
        """
{% for i in range(5) %}
echo "{{ ds }}"
echo "{{ macros.ds_add(ds, 7)}}"
{% endfor %}
"""
    )

    t3 = BashOperator(
        task_id="templated",
        depends_on_past=False,
        bash_command=templated_command,
    )
    # [END jinja_template]

    # Declare t2 and t3 as downstream of t1.
    t1 >> [t2, t3]
# [END tutorial]
21 changes: 20 additions & 1 deletion rialto_airflow/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import csv
import datetime

from pathlib import Path


Expand All @@ -14,3 +14,22 @@ def create_snapshot_dir(data_dir):
snapshot_dir.mkdir()

return str(snapshot_dir)


def rialto_authors_file(data_dir):
    """Return the path of the rialto-orgs ``authors.csv`` inside *data_dir*.

    The path is only constructed here; nothing checks that the file exists.
    """
    return Path(data_dir).joinpath("authors.csv")


def rialto_authors_orcids(rialto_authors_file):
    """Extract the non-blank values of the ``orcidid`` column from authors.csv.

    Args:
        rialto_authors_file: path to the authors CSV. Its first row must be a
            header row that contains an ``orcidid`` column.

    Returns:
        A list of the ``orcidid`` values in file order, with surrounding
        whitespace removed. Rows whose ``orcidid`` cell is missing or blank
        are skipped. A completely empty file yields ``[]``.

    Raises:
        ValueError: if the header row has no ``orcidid`` column.
    """
    # newline="" is what the csv module asks for, so that newlines embedded
    # in quoted fields are handled by the reader rather than by open().
    with open(rialto_authors_file, "r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)

        header = next(reader, None)
        if header is None:
            # No header row at all: nothing to extract.
            return []

        try:
            orcid_col = header.index("orcidid")
        except ValueError:
            raise ValueError(
                f"no 'orcidid' column found in {rialto_authors_file}"
            ) from None

        # Guard on the row length so truncated rows are skipped instead of
        # raising IndexError; drop blank cells so callers only get real values.
        return [
            row[orcid_col].strip()
            for row in reader
            if len(row) > orcid_col and row[orcid_col].strip()
        ]
21 changes: 20 additions & 1 deletion test/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,27 @@
import csv
from pathlib import Path
import pytest
from rialto_airflow.utils import create_snapshot_dir, rialto_authors_orcids

from rialto_airflow.utils import create_snapshot_dir

@pytest.fixture
def authors_csv(tmp_path):
    """Write a small authors.csv into *tmp_path* and return its path.

    The file has a ``sunetid,orcidid`` header row followed by two authors.
    """
    rows = [
        ["sunetid", "orcidid"],
        ["author1", "https://orcid.org/0000-0000-0000-0001"],
        ["author2", "https://orcid.org/0000-0000-0000-0002"],
    ]
    fixture_file = tmp_path / "authors.csv"
    with open(fixture_file, "w", newline="") as csvfile:
        csv.writer(csvfile).writerows(rows)
    return fixture_file


def test_create_snapshot_dir(tmpdir):
    """The path returned by create_snapshot_dir refers to a directory."""
    created = create_snapshot_dir(tmpdir)
    assert Path(created).is_dir()


def test_rialto_authors_orcids(tmp_path, authors_csv):
    """Both ORCIDs from the fixture file are returned, in file order."""
    orcids = rialto_authors_orcids(authors_csv)
    # Compare the whole list so a missing, extra, or reordered value fails.
    assert orcids == [
        "https://orcid.org/0000-0000-0000-0001",
        "https://orcid.org/0000-0000-0000-0002",
    ]

0 comments on commit da59bce

Please sign in to comment.