run-cluster #332
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manually dispatched workflow; the inputs below are consumed by the jobs
# defined further down in this file.
name: run-cluster

on:
  workflow_dispatch:
    inputs:
      # --- Which Daft build to use -----------------------------------------
      # Leave both `daft_wheel_url` and `daft_version` empty to have the
      # `build-commit` job build a wheel instead.
      daft_wheel_url:
        description: 'Daft python-wheel URL'
        type: string
        required: false
      daft_version:
        description: 'Daft version (errors if both this and "Daft python-wheel URL" are provided)'
        type: string
        required: false

      # --- Runtime environment ---------------------------------------------
      # Quoted so the version stays a string rather than becoming a float.
      python_version:
        description: 'Python version'
        type: string
        required: false
        default: '3.9'
      # Profiles whose name ends in `-x86` are built for x86; the remaining
      # profile is built for arm (see the `build-commit` job).
      cluster_profile:
        description: 'Cluster profile'
        type: choice
        options:
          - benchmarking-arm
          - medium-x86
          - debug_xs-x86
        required: false
        default: medium-x86

      # --- What to run -----------------------------------------------------
      working_dir:
        description: 'Working directory'
        type: string
        required: false
        default: '.github/working-dir'
      # The only mandatory input.
      entrypoint_script:
        description: 'Entry-point python script (must be inside of the working directory)'
        type: string
        required: true
      entrypoint_args:
        description: 'Entry-point arguments (either a simple string or a JSON list)'
        type: string
        required: false
        default: ''
      env_vars:
        description: 'Environment variables'
        type: string
        required: false
        default: ''
jobs:
  # Builds a Daft wheel via the reusable `build-commit` workflow. Only runs
  # when the caller supplied neither a Daft version nor a wheel URL.
  build-commit:
    uses: ./.github/workflows/build-commit.yaml
    if: ${{ inputs.daft_version == '' && inputs.daft_wheel_url == '' }}
    with:
      # Every x86 cluster profile is named `*-x86`; anything else maps to arm.
      arch: ${{ endsWith(inputs.cluster_profile, '-x86') && 'x86' || 'arm' }}
      python_version: ${{ inputs.python_version }}
    secrets:
      ACTIONS_AWS_ROLE_ARN: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}

  # Spins up a Ray cluster, submits the entry-point script to it, collects the
  # logs and outputs, and tears the cluster down again.
  run-command:
    runs-on: [self-hosted, linux, x64, ci-dev]
    # If either the `daft_version` or the `daft_wheel_url` input is specified,
    # the `build-commit` job is entirely skipped. We still want to run this job
    # in that case, so a skipped `build-commit` is accepted alongside a
    # successful one. A failed build or a cancelled run does not start a cluster.
    if: ${{ !cancelled() && (needs.build-commit.result == 'success' || needs.build-commit.result == 'skipped') }}
    permissions:
      id-token: write
      contents: read
    needs: build-commit
    steps:
      - name: Log workflow inputs
        # Passed through the environment: the JSON contains double quotes, so
        # interpolating it straight into the script would break the quoting.
        env:
          WORKFLOW_INPUTS: ${{ toJson(github.event.inputs) }}
        run: echo "$WORKFLOW_INPUTS"

      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-west-2
          role-session-name: run-command-workflow

      - name: Install uv, rust, python
        uses: ./.github/actions/install
        with:
          python_version: ${{ inputs.python_version }}

      - name: Setup uv environment
        run: |
          uv venv
          source .venv/bin/activate
          # Quoted so the shell never treats `[default]` as a glob pattern.
          uv pip install "ray[default]" boto3
          GHA_OUTPUT_DIR=/tmp/outputs
          mkdir -p "$GHA_OUTPUT_DIR"
          echo "Output dir is set to $GHA_OUTPUT_DIR"
          # Exported so later steps (job submission, artifact upload) can read it.
          echo "GHA_OUTPUT_DIR=$GHA_OUTPUT_DIR" >> "$GITHUB_ENV"

      - name: Dynamically update ray config file
        # Inputs reach the script as environment variables rather than being
        # spliced into the shell text, so quotes or other shell metacharacters
        # in a value cannot alter the command line.
        env:
          CFG_CLUSTER_NAME: ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}
          # Prefer the freshly built wheel; fall back to the caller-supplied URL.
          CFG_DAFT_WHEEL_URL: ${{ needs.build-commit.outputs.wheel_url || inputs.daft_wheel_url || '' }}
          CFG_DAFT_VERSION: ${{ inputs.daft_version }}
          CFG_PYTHON_VERSION: ${{ inputs.python_version }}
          CFG_CLUSTER_PROFILE: ${{ inputs.cluster_profile }}
          CFG_WORKING_DIR: ${{ inputs.working_dir }}
          CFG_ENTRYPOINT_SCRIPT: ${{ inputs.entrypoint_script }}
        run: |
          source .venv/bin/activate
          # The template is fed on stdin and the rendered config is written
          # with truncation, so a stale ray.yaml can never be appended to.
          uv run \
            --python 3.12 \
            .github/ci-scripts/templatize_ray_config.py \
            --cluster-name="$CFG_CLUSTER_NAME" \
            --daft-wheel-url="$CFG_DAFT_WHEEL_URL" \
            --daft-version="$CFG_DAFT_VERSION" \
            --python-version="$CFG_PYTHON_VERSION" \
            --cluster-profile="$CFG_CLUSTER_PROFILE" \
            --working-dir="$CFG_WORKING_DIR" \
            --entrypoint-script="$CFG_ENTRYPOINT_SCRIPT" \
            < .github/assets/template.yaml \
            > .github/assets/ray.yaml
          cat .github/assets/ray.yaml

      - name: Download private ssh key
        run: |
          KEY_FILE=~/.ssh/ci-github-actions-ray-cluster-key.pem
          # Make sure nothing created below is ever group/world readable.
          umask 077
          mkdir -p ~/.ssh
          KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
          # Overwrite instead of append: this is a self-hosted runner, so the
          # home directory can outlive a single workflow run.
          printf '%s\n' "$KEY" > "$KEY_FILE"
          chmod 600 "$KEY_FILE"

      - name: Spin up ray cluster
        run: |
          source .venv/bin/activate
          ray up .github/assets/ray.yaml -y

      - name: Setup connection to ray cluster
        run: |
          source .venv/bin/activate
          # Runs in the background; stopped again by the
          # "Kill connection to ray cluster" step.
          ray dashboard .github/assets/ray.yaml &

      - name: Submit job to ray cluster
        # Inputs are handed over via the environment (see the note on the
        # config-templating step); a JSON list in `entrypoint_args` may
        # legitimately contain quotes.
        env:
          JOB_WORKING_DIR: ${{ inputs.working_dir }}
          JOB_ENTRYPOINT_SCRIPT: ${{ inputs.entrypoint_script }}
          JOB_ENTRYPOINT_ARGS: ${{ inputs.entrypoint_args }}
          JOB_ENV_VARS: ${{ inputs.env_vars }}
        run: |
          source .venv/bin/activate
          if [[ -z "$JOB_ENTRYPOINT_SCRIPT" ]]; then
            echo 'Invalid command submitted; command cannot be empty'
            exit 1
          fi
          echo "Output dir: $GHA_OUTPUT_DIR"
          python .github/ci-scripts/job_runner.py \
            --working-dir="$JOB_WORKING_DIR" \
            --entrypoint-script="$JOB_ENTRYPOINT_SCRIPT" \
            --entrypoint-args="$JOB_ENTRYPOINT_ARGS" \
            --env-vars="$JOB_ENV_VARS" \
            --enable-ray-tracing

      - name: Download log files from ray cluster
        if: always()
        run: |
          source .venv/bin/activate
          ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs ray-daft-logs
          # Replace every ':' in downloaded file/directory names with '_'.
          # `-depth` renames children before their parent directories.
          find ray-daft-logs -depth -name '*:*' -exec bash -c '
            for filepath; do
              dir=$(dirname "$filepath")
              base=$(basename "$filepath")
              new_base=${base//:/_}
              mv "$filepath" "$dir/$new_base"
            done
          ' _ {} +

      - name: Kill connection to ray cluster
        # Clean up the background `ray dashboard` process even when an earlier
        # step failed.
        if: always()
        run: |
          # `lsof` exits non-zero when nothing matches; without `|| true` the
          # default `bash -e` shell would fail the step right here.
          PID=$(lsof -t -i:8265 || true)
          # lsof prints one PID per line; flatten to a space-separated list.
          PID=${PID//$'\n'/ }
          if [[ -n "$PID" ]]; then
            echo "Process $PID is listening on port 8265; killing it..."
            # Unquoted on purpose so that each PID becomes its own argument.
            # Testing `kill` directly keeps the failure branch reachable
            # under `bash -e`.
            if kill -9 $PID; then
              echo "Process $PID killed successfully"
            else
              echo "Failed to kill process $PID"
            fi
          fi

      - name: Spin down ray cluster
        if: always()
        run: |
          source .venv/bin/activate
          ray down .github/assets/ray.yaml -y

      - name: Upload output dir
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: outputs
          path: ${{ env.GHA_OUTPUT_DIR }}

      - name: Upload log files
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ray-daft-logs
          path: ray-daft-logs