# Revert to sequential eval runs in CI (#1159)
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow header: display name, trigger, and workflow-level environment.
name: Evals

# Triggered by the opened, synchronize, and labeled pull-request activity types.
on:
  pull_request:
    types: [opened, synchronize, labeled]

# Workflow-level environment variables.
# NOTE(review): no step visible in this file reads EVAL_MODELS or
# EVAL_CATEGORIES; presumably the real (non-dummy) eval scripts do — confirm.
env:
  EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
  EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"

# Job definitions follow as children of this mapping.
jobs:
determine-evals:
  # Turns pull-request labels into four "true"/"false" job outputs
  # (run-extract, run-act, run-observe, run-text-extract) for later jobs.
  runs-on: ubuntu-latest
  outputs:
    run-extract: ${{ steps.check-labels.outputs.run-extract }}
    run-act: ${{ steps.check-labels.outputs.run-act }}
    run-observe: ${{ steps.check-labels.outputs.run-observe }}
    run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }}
  steps:
    - id: check-labels
      # Expressions are resolved once here and handed to the script as plain
      # environment variables, so the shell body contains no `${{ }}` text.
      env:
        CURRENT_REF: ${{ github.ref }}
        WANT_EXTRACT: ${{ contains(github.event.pull_request.labels.*.name, 'extract') }}
        WANT_ACT: ${{ contains(github.event.pull_request.labels.*.name, 'act') }}
        WANT_OBSERVE: ${{ contains(github.event.pull_request.labels.*.name, 'observe') }}
        WANT_TEXT_EXTRACT: ${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}
      run: |
        # Default to running all tests on main branch.
        # NOTE(review): the `on:` block in this file lists only pull_request;
        # confirm whether the ref can ever equal refs/heads/main here.
        if [[ "$CURRENT_REF" == "refs/heads/main" ]]; then
          echo "Running all tests for main branch"
          {
            echo "run-extract=true"
            echo "run-act=true"
            echo "run-observe=true"
            echo "run-text-extract=true"
          } >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # Otherwise each category is enabled only by its own label.
        {
          echo "run-extract=$WANT_EXTRACT"
          echo "run-act=$WANT_ACT"
          echo "run-observe=$WANT_OBSERVE"
          echo "run-text-extract=$WANT_TEXT_EXTRACT"
        } >> "$GITHUB_OUTPUT"
run-lint:
  # Checks out the repository, installs dependencies on Node.js 20, and runs
  # the project's `lint` npm script.
  runs-on: ubuntu-latest
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      # NOTE(review): --no-frozen-lockfile looks like a pnpm/Yarn-style flag;
      # confirm npm actually honours it.
      run: npm install --no-frozen-lockfile

    - name: Run Lint
      run: npm run lint
run-build:
  # Checks out the repository, installs dependencies on Node.js 20, and runs
  # the project's `build` npm script.
  runs-on: ubuntu-latest
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      # NOTE(review): --no-frozen-lockfile looks like a pnpm/Yarn-style flag;
      # confirm npm actually honours it.
      run: npm install --no-frozen-lockfile

    - name: Run Build
      run: npm run build
run-e2e-tests:
  # Runs the `e2e` npm script after both lint and build have succeeded.
  needs:
    - run-lint
    - run-build
  runs-on: ubuntu-latest
  timeout-minutes: 50
  env:
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      # NOTE(review): --no-frozen-lockfile looks like a pnpm/Yarn-style flag;
      # confirm npm actually honours it.
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Build Stagehand
      run: npm run build

    - name: Run E2E Tests (Deterministic Playwright)
      run: npm run e2e
run-e2e-bb-tests:
  # Runs the `e2e:bb` npm script once the local E2E job has passed.
  needs:
    - run-e2e-tests
  runs-on: ubuntu-latest
  timeout-minutes: 50
  # Runs for push events, and for pull requests whose head repository is this
  # repository; other pull requests skip the job.
  # NOTE(review): presumably because the secrets below are not exposed to
  # pull requests from forks — confirm.
  if: >-
    github.event_name == 'push' ||
    (github.event_name == 'pull_request' &&
    github.event.pull_request.head.repo.full_name == github.repository)
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      # NOTE(review): --no-frozen-lockfile looks like a pnpm/Yarn-style flag;
      # confirm npm actually honours it.
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Build Stagehand
      run: npm run build

    - name: Run E2E Tests (browserbase)
      run: npm run e2e:bb
# --------------------------------------------------------------------
# Dummy eval jobs, from combination onward. Every job except combination
# skips its eval steps when the matching pull-request label is missing.
# --------------------------------------------------------------------
run-combination-evals:
  # Dummy combination eval: writes a canned eval-summary.json and logs its
  # score. It has no label gate and starts once both E2E jobs and
  # determine-evals have finished.
  needs: [run-e2e-bb-tests, run-e2e-tests, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 40
  env:
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Dummy Combination Evals
      run: |
        echo "Running dummy combination evals..."
        echo '{"experimentName":"dummyCombination","categories":{"combination":95}}' > eval-summary.json

    - name: Log Combination Evals Performance
      run: |
        # Verify the summary exists before jq touches it; otherwise jq's own
        # failure would end the step before this message could be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for combination category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.example.com/experiments/${experimentName}"

        # The score is only reported here; no threshold is enforced.
        combination_score=$(jq '.categories.combination' eval-summary.json)
        echo "Combination category score: $combination_score%"
run-act-evals:
  # Dummy act eval, run after the combination job. The eval steps execute
  # only when determine-evals reported run-act == "true"; the step fails CI
  # when the act score is below 80.
  #
  # determine-evals must be listed in `needs`: the `needs` context exposes
  # outputs of direct dependencies only, so without it the label check below
  # always sees an empty string and the evals are silently skipped.
  needs: [run-combination-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 25
  env:
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Check for 'act' label
      id: label-check
      run: |
        if [ "${{ needs.determine-evals.outputs.run-act }}" != "true" ]; then
          echo "No label selected for \"act\", continuing without running \"act\"."
          echo "has_label=false" >> "$GITHUB_OUTPUT"
        else
          echo "has_label=true" >> "$GITHUB_OUTPUT"
        fi

    - name: Dummy Act Evals
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        echo "Running dummy act evals..."
        echo '{"experimentName":"dummyAct","categories":{"act":82}}' > eval-summary.json

    - name: Log Act Evals Performance
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        # Verify the summary exists before jq touches it; otherwise jq's own
        # failure would end the step before this message could be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for act category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.example.com/experiments/${experimentName}"

        act_score=$(jq '.categories.act' eval-summary.json)
        echo "Act category score: $act_score%"

        # bc prints 1 when the comparison holds, which (( )) treats as true.
        if (( $(echo "$act_score < 80" | bc -l) )); then
          echo "Act category score is below 80%. Failing CI."
          exit 1
        fi
run-extract-evals:
  # Dummy extract evals, run after the act job. Two variants (domExtract and
  # textExtract) each write a summary; both scores are logged, but only the
  # domExtract score gates CI (must be at least 80).
  #
  # determine-evals must be listed in `needs`: the `needs` context exposes
  # outputs of direct dependencies only, so without it the label check below
  # always sees an empty string and the evals are silently skipped.
  needs: [run-act-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 50
  env:
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Check for 'extract' label
      id: label-check
      run: |
        if [ "${{ needs.determine-evals.outputs.run-extract }}" != "true" ]; then
          echo "No label selected for \"extract\", continuing without running \"extract\"."
          echo "has_label=false" >> "$GITHUB_OUTPUT"
        else
          echo "has_label=true" >> "$GITHUB_OUTPUT"
        fi

    # 1. Dummy "domExtract"
    - name: Dummy Extract Evals (domExtract)
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        echo "Running dummy extract evals (domExtract)..."
        echo '{"experimentName":"dummyDomExtract","categories":{"extract":85}}' > eval-summary.json

    # Rename so the second variant does not overwrite this summary.
    - name: Save Extract Dom Results
      if: steps.label-check.outputs.has_label == 'true'
      run: mv eval-summary.json eval-summary-extract-dom.json

    # 2. Dummy "textExtract"
    - name: Dummy Extract Evals (textExtract)
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        echo "Running dummy extract evals (textExtract)..."
        echo '{"experimentName":"dummyTextExtract","categories":{"extract":90}}' > eval-summary.json

    - name: Save Extract Text Results
      if: steps.label-check.outputs.has_label == 'true'
      run: mv eval-summary.json eval-summary-extract-text.json

    # 3. Log and Compare
    - name: Log and Compare Extract Evals Performance
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json)
        dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json)
        echo "DomExtract Extract category score: $dom_score%"
        echo "View domExtract results: https://www.example.com/experiments/${experimentNameDom}"

        experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json)
        text_score=$(jq '.categories.extract' eval-summary-extract-text.json)
        echo "TextExtract Extract category score: $text_score%"
        echo "View textExtract results: https://www.example.com/experiments/${experimentNameText}"

        # If domExtract <80% fail CI (the textExtract score is informational).
        if (( $(echo "$dom_score < 80" | bc -l) )); then
          echo "DomExtract extract category score is below 80%. Failing CI."
          exit 1
        fi
run-text-extract-evals:
  # Dummy text_extract evals, run after the extract job. Two variants
  # (textExtract and domExtract) each write a summary; both scores are
  # logged, but only the textExtract score gates CI (must be at least 80).
  #
  # determine-evals must be listed in `needs`: the `needs` context exposes
  # outputs of direct dependencies only, so without it the label check below
  # always sees an empty string and the evals are silently skipped.
  needs: [run-extract-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 120
  env:
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Check for 'text-extract' label
      id: label-check
      run: |
        if [ "${{ needs.determine-evals.outputs.run-text-extract }}" != "true" ]; then
          echo "No label selected for \"text-extract\", continuing without running \"text-extract\"."
          echo "has_label=false" >> "$GITHUB_OUTPUT"
        else
          echo "has_label=true" >> "$GITHUB_OUTPUT"
        fi

    # 1. Dummy text_extract with textExtract
    - name: Dummy text_extract Evals (textExtract)
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        echo "Running dummy text_extract (textExtract)..."
        echo '{"experimentName":"dummyTextExtract","categories":{"text_extract":90}}' > eval-summary.json

    # Rename so the second variant does not overwrite this summary.
    - name: Save text_extract Text Results
      if: steps.label-check.outputs.has_label == 'true'
      run: mv eval-summary.json eval-summary-text_extract-text.json

    # 2. Dummy text_extract with domExtract
    - name: Dummy text_extract Evals (domExtract)
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        echo "Running dummy text_extract (domExtract)..."
        echo '{"experimentName":"dummyDomExtract","categories":{"text_extract":88}}' > eval-summary.json

    - name: Save text_extract Dom Results
      if: steps.label-check.outputs.has_label == 'true'
      run: mv eval-summary.json eval-summary-text_extract-dom.json

    # 3. Log and Compare text_extract Evals
    - name: Log and Compare text_extract Evals Performance
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
        text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
        echo "TextExtract text_extract category score: $text_score%"
        echo "View textExtract results: https://www.example.com/experiments/${experimentNameText}"

        experimentNameDom=$(jq -r '.experimentName' eval-summary-text_extract-dom.json)
        dom_score=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json)
        echo "DomExtract text_extract category score: $dom_score%"
        echo "View domExtract results: https://www.example.com/experiments/${experimentNameDom}"

        # If text_score <80% fail CI (the domExtract score is informational).
        if (( $(echo "$text_score < 80" | bc -l) )); then
          echo "textExtract text_extract category score is below 80%. Failing CI."
          exit 1
        fi
run-observe-evals:
  # Dummy observe eval, run after the text-extract job. The eval steps
  # execute only when determine-evals reported run-observe == "true"; the
  # step fails CI when the observe score is below 80.
  #
  # determine-evals must be listed in `needs`: the `needs` context exposes
  # outputs of direct dependencies only, so without it the label check below
  # always sees an empty string and the evals are silently skipped.
  needs: [run-text-extract-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 25
  env:
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Check for 'observe' label
      id: label-check
      run: |
        if [ "${{ needs.determine-evals.outputs.run-observe }}" != "true" ]; then
          echo "No label selected for \"observe\", continuing without running \"observe\"."
          echo "has_label=false" >> "$GITHUB_OUTPUT"
        else
          echo "has_label=true" >> "$GITHUB_OUTPUT"
        fi

    - name: Dummy Observe Evals
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        echo "Running dummy observe evals..."
        echo '{"experimentName":"dummyObserve","categories":{"observe":85}}' > eval-summary.json

    - name: Log Observe Evals Performance
      if: steps.label-check.outputs.has_label == 'true'
      run: |
        # Verify the summary exists before jq touches it; otherwise jq's own
        # failure would end the step before this message could be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for observe category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.example.com/experiments/${experimentName}"

        observe_score=$(jq '.categories.observe' eval-summary.json)
        echo "Observe category score: $observe_score%"

        # bc prints 1 when the comparison holds, which (( )) treats as true.
        if (( $(echo "$observe_score < 80" | bc -l) )); then
          echo "Observe category score is below 80%. Failing CI."
          exit 1
        fi