Skip to content

run-cluster

run-cluster #332

Workflow file for this run

name: run-cluster
on:
workflow_dispatch:
inputs:
daft_wheel_url:
description: Daft python-wheel URL
type: string
required: false
daft_version:
description: Daft version (errors if both this and "Daft python-wheel URL" are provided)
type: string
required: false
python_version:
description: Python version
type: string
required: false
default: "3.9"
cluster_profile:
description: Cluster profile
type: choice
options:
- benchmarking-arm
- medium-x86
- debug_xs-x86
required: false
default: medium-x86
working_dir:
description: Working directory
type: string
required: false
default: .github/working-dir
entrypoint_script:
description: Entry-point python script (must be inside of the working directory)
type: string
required: true
entrypoint_args:
description: Entry-point arguments (either a simple string or a JSON list)
type: string
required: false
default: ""
env_vars:
description: Environment variables
type: string
required: false
default: ""
jobs:
build-commit:
uses: ./.github/workflows/build-commit.yaml
if: ${{ inputs.daft_version == '' && inputs.daft_wheel_url == '' }}
with:
arch: ${{ (inputs.cluster_profile == 'debug_xs-x86' || inputs.cluster_profile == 'medium-x86') && 'x86' || 'arm' }}
python_version: ${{ inputs.python_version }}
secrets:
ACTIONS_AWS_ROLE_ARN: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
run-command:
runs-on: [self-hosted, linux, x64, ci-dev]
# If both the `daft-version` and `daft-wheel-url` parameters are not specified, the `build-commit` job is entirely skipped.
# We still want to run this job, even if `build-commit` is skipped.
# The `always()` guarantees that this job is always run.
if: always()
permissions:
id-token: write
contents: read
needs: build-commit
steps:
- name: Log workflow inputs
run: echo "${{ toJson(github.event.inputs) }}"
- name: Checkout repo
uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-west-2
role-session-name: run-command-workflow
- name: Install uv, rust, python
uses: ./.github/actions/install
with:
python_version: ${{ inputs.python_version }}
- name: Setup uv environment
run: |
uv v
source .venv/bin/activate
uv pip install ray[default] boto3
GHA_OUTPUT_DIR=/tmp/outputs
mkdir -p $GHA_OUTPUT_DIR
echo "Output dir is set to $GHA_OUTPUT_DIR"
echo "GHA_OUTPUT_DIR=$GHA_OUTPUT_DIR" >> $GITHUB_ENV
- name: Dynamically update ray config file
run: |
source .venv/bin/activate
(cat .github/assets/template.yaml | \
uv run \
--python 3.12 \
.github/ci-scripts/templatize_ray_config.py \
--cluster-name="ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \
--daft-wheel-url='${{ needs.build-commit.outputs.wheel_url || inputs.daft_wheel_url || '' }}' \
--daft-version='${{ inputs.daft_version }}' \
--python-version='${{ inputs.python_version }}' \
--cluster-profile='${{ inputs.cluster_profile }}' \
--working-dir='${{ inputs.working_dir }}' \
--entrypoint-script='${{ inputs.entrypoint_script }}'
) >> .github/assets/ray.yaml
cat .github/assets/ray.yaml
- name: Download private ssh key
run: |
KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
echo "$KEY" >> ~/.ssh/ci-github-actions-ray-cluster-key.pem
chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem
- name: Spin up ray cluster
run: |
source .venv/bin/activate
ray up .github/assets/ray.yaml -y
- name: Setup connection to ray cluster
run: |
source .venv/bin/activate
ray dashboard .github/assets/ray.yaml &
- name: Submit job to ray cluster
run: |
source .venv/bin/activate
if [[ -z '${{ inputs.entrypoint_script }}' ]]; then
echo 'Invalid command submitted; command cannot be empty'
exit 1
fi
echo "Output dir: $GHA_OUTPUT_DIR"
python .github/ci-scripts/job_runner.py \
--working-dir='${{ inputs.working_dir }}' \
--entrypoint-script='${{ inputs.entrypoint_script }}' \
--entrypoint-args='${{ inputs.entrypoint_args }}' \
--env-vars='${{ inputs.env_vars }}' \
--enable-ray-tracing
- name: Download log files from ray cluster
if: always()
run: |
source .venv/bin/activate
ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs ray-daft-logs
find ray-daft-logs -depth -name '*:*' -exec bash -c '
for filepath; do
dir=$(dirname "$filepath")
base=$(basename "$filepath")
new_base=${base//:/_}
mv "$filepath" "$dir/$new_base"
done
' _ {} +
- name: Kill connection to ray cluster
run: |
PID=$(lsof -t -i:8265)
if [[ -n "$PID" ]]; then
echo "Process $PID is listening on port 8265; killing it..."
kill -9 "$PID"
if [[ $? -eq 0 ]]; then
echo "Process $PID killed successfully"
else
echo "Failed to kill process $PID"
fi
fi
- name: Spin down ray cluster
if: always()
run: |
source .venv/bin/activate
ray down .github/assets/ray.yaml -y
- name: Upload output dir
if: always()
uses: actions/upload-artifact@v4
with:
name: outputs
path: ${{ env.GHA_OUTPUT_DIR }}
- name: Upload log files
if: always()
uses: actions/upload-artifact@v4
with:
name: ray-daft-logs
path: ray-daft-logs