Revert back to sequential evals runs in CI #1175
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow: lint, build, end-to-end tests, and the label-gated eval suites.
name: Evals

# Runs for pull requests when they are opened, updated with new commits,
# or have a label added.
on:
  pull_request:
    types: [opened, synchronize, labeled]

# Workflow-wide environment, exported to every job.
env:
  EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
  EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"

jobs:
# Job (nests under the workflow's `jobs:` mapping).
# Decides which eval categories should run and publishes one "true"/"false"
# output per category for the downstream eval jobs to read.
determine-evals:
  runs-on: ubuntu-latest
  outputs:
    run-extract: ${{ steps.check-labels.outputs.run-extract }}
    run-act: ${{ steps.check-labels.outputs.run-act }}
    run-observe: ${{ steps.check-labels.outputs.run-observe }}
    run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }}
  steps:
    - id: check-labels
      # Evaluate each label expression exactly once and hand the result to the
      # script through the environment instead of splicing expressions into
      # the shell source.
      env:
        HAS_EXTRACT_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'extract') }}
        HAS_ACT_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'act') }}
        HAS_OBSERVE_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'observe') }}
        HAS_TEXT_EXTRACT_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}
      run: |
        # Default to running all tests on main branch.
        # GITHUB_REF is the runner-provided equivalent of `github.ref`.
        # NOTE(review): this workflow's visible trigger is `pull_request` only,
        # where the ref is not refs/heads/main - confirm whether a push trigger
        # is intended.
        if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
          echo "Running all tests for main branch"
          {
            echo "run-extract=true"
            echo "run-act=true"
            echo "run-observe=true"
            echo "run-text-extract=true"
          } >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # Check for specific labels
        {
          echo "run-extract=${HAS_EXTRACT_LABEL}"
          echo "run-act=${HAS_ACT_LABEL}"
          echo "run-observe=${HAS_OBSERVE_LABEL}"
          echo "run-text-extract=${HAS_TEXT_EXTRACT_LABEL}"
        } >> "$GITHUB_OUTPUT"

        # Echo the decisions into the job log for debugging.
        echo "Extract label: ${HAS_EXTRACT_LABEL}"
        echo "Act label: ${HAS_ACT_LABEL}"
        echo "Observe label: ${HAS_OBSERVE_LABEL}"
        echo "Text-Extract label: ${HAS_TEXT_EXTRACT_LABEL}"
# Job (nests under the workflow's `jobs:` mapping).
# Installs dependencies on Node 20 and runs the project's `lint` npm script.
run-lint:
  runs-on: ubuntu-latest
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Run Lint
      run: npm run lint
# Job (nests under the workflow's `jobs:` mapping).
# Installs dependencies on Node 20 and runs the project's `build` npm script.
run-build:
  runs-on: ubuntu-latest
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Run Build
      run: npm run build
# Job (nests under the workflow's `jobs:` mapping).
# Runs the `e2e` npm script once both the lint and build jobs have succeeded.
run-e2e-tests:
  needs: [run-lint, run-build]
  runs-on: ubuntu-latest
  timeout-minutes: 50
  env:
    # Environment values reach the process as strings; quoted to make that explicit.
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Build Stagehand
      run: npm run build

    - name: Run E2E Tests (Deterministic Playwright)
      run: npm run e2e
# Job (nests under the workflow's `jobs:` mapping).
# Runs the `e2e:bb` npm script after the local E2E job, with the API secrets
# listed below exported to its steps.
run-e2e-bb-tests:
  needs: [run-e2e-tests]
  runs-on: ubuntu-latest
  timeout-minutes: 50
  # Run for push events, or for pull requests whose head branch lives in this
  # same repository (i.e. not opened from a fork).
  if: >-
    github.event_name == 'push' ||
    (github.event_name == 'pull_request' &&
    github.event.pull_request.head.repo.full_name == github.repository)
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: "true"
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Build Stagehand
      run: npm run build

    - name: Run E2E Tests (browserbase)
      run: npm run e2e:bb
# Job (nests under the workflow's `jobs:` mapping).
# 1) Combination evals: first link of the sequential eval chain. Runs after
# both E2E jobs and the label-detection job; it is not gated on a label.
run-combination-evals:
  needs: [run-e2e-bb-tests, run-e2e-tests, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 40
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Build Stagehand
      run: npm run build

    # Placeholder: writes a fixed summary file instead of running real evals.
    - name: Dummy Combination Evals
      run: |
        echo "Running dummy combination evals..."
        echo '{"experimentName":"dummyCombination","categories":{"combination":95}}' > eval-summary.json

    - name: Log Combination Evals Performance
      run: |
        # Verify the summary exists BEFORE parsing it. The default shell runs
        # with `-e`, so a jq call on a missing file would abort the step
        # before the explanatory message below could ever be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for combination category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        # The combination score is reported only; no threshold is enforced.
        combination_score=$(jq '.categories.combination' eval-summary.json)
        echo "Combination category score: $combination_score%"
# 2) Act depends on combination. If no label: exit 0 and pass.
# Job (nests under the workflow's `jobs:` mapping). Every step after the label
# check is gated with a step-level `if:` so that, without the label, the job
# still completes successfully and the jobs that `needs` it keep running.
run-act-evals:
  needs: [run-combination-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 25
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    # Publishes `has_label` as a step output and logs when the label is absent.
    - name: Check for 'act' label
      id: label-check
      run: |
        if [ "${{ needs.determine-evals.outputs.run-act }}" != "true" ]; then
          echo "has_label=false" >> "$GITHUB_OUTPUT"
          echo "No label for ACT. Exiting with success."
        else
          echo "has_label=true" >> "$GITHUB_OUTPUT"
        fi

    # Debug aid: prints the upstream decision next to a fresh label lookup.
    - name: Log value of run-act
      run: |
        echo "run-act: ${{ needs.determine-evals.outputs.run-act }}"
        echo "run-act: ${{ contains(github.event.pull_request.labels.*.name, 'act') }}"

    - name: Set up Node.js
      if: needs.determine-evals.outputs.run-act == 'true'
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      if: needs.determine-evals.outputs.run-act == 'true'
      run: npm install --no-frozen-lockfile

    - name: Build Stagehand
      if: needs.determine-evals.outputs.run-act == 'true'
      run: npm run build

    # Placeholder: writes a fixed summary file instead of running real evals.
    - name: Dummy Act Evals
      if: needs.determine-evals.outputs.run-act == 'true'
      run: |
        echo "Running dummy act evals..."
        echo '{"experimentName":"dummyAct","categories":{"act":82}}' > eval-summary.json

    - name: Log Act Evals Performance
      if: needs.determine-evals.outputs.run-act == 'true'
      run: |
        # Verify the summary exists BEFORE parsing it. The default shell runs
        # with `-e`, so a jq call on a missing file would abort the step
        # before the explanatory message below could ever be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for act category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        act_score=$(jq '.categories.act' eval-summary.json)
        echo "Act category score: $act_score%"

        # bc prints 1 when the comparison holds; (( 1 )) is true in bash.
        if (( $(echo "$act_score < 80" | bc -l) )); then
          echo "Act category score is below 80%. Failing CI."
          exit 1
        fi
# 3) Extract depends on act. If no label: exit 0 and pass.
# Job (nests under the workflow's `jobs:` mapping). `exit 0` inside a step ends
# only that step, so the label gate is enforced with a step-level `if:` on
# every later step. A job-level `if:` is deliberately not used: a skipped job
# would also skip the jobs that `needs` it.
run-extract-evals:
  needs: [run-act-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 50
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    # Log-only: the actual gating is done by the `if:` on the steps below.
    - name: Check for 'extract' label
      run: |
        if [ "${{ needs.determine-evals.outputs.run-extract }}" != "true" ]; then
          echo "No label for EXTRACT. Exiting with success."
        fi

    - name: Set up Node.js
      if: needs.determine-evals.outputs.run-extract == 'true'
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      if: needs.determine-evals.outputs.run-extract == 'true'
      run: npm install --no-frozen-lockfile

    - name: Build Stagehand
      if: needs.determine-evals.outputs.run-extract == 'true'
      run: npm run build

    - name: Install Playwright browsers
      if: needs.determine-evals.outputs.run-extract == 'true'
      run: npm exec playwright install --with-deps

    # 1. Run extract category with domExtract
    # Placeholder: writes a fixed summary file instead of running real evals.
    - name: Dummy Extract Evals (domExtract)
      if: needs.determine-evals.outputs.run-extract == 'true'
      run: |
        echo "Running dummy extract evals (domExtract)..."
        echo '{"experimentName":"dummyDomExtract","categories":{"extract":85}}' > eval-summary.json

    # Rename so the next run does not overwrite this summary.
    - name: Save Extract Dom Results
      if: needs.determine-evals.outputs.run-extract == 'true'
      run: mv eval-summary.json eval-summary-extract-dom.json

    # 2. Then run extract category with textExtract
    - name: Dummy Extract Evals (textExtract)
      if: needs.determine-evals.outputs.run-extract == 'true'
      run: |
        echo "Running dummy extract evals (textExtract)..."
        echo '{"experimentName":"dummyTextExtract","categories":{"extract":90}}' > eval-summary.json

    - name: Save Extract Text Results
      if: needs.determine-evals.outputs.run-extract == 'true'
      run: mv eval-summary.json eval-summary-extract-text.json

    # 3. Log and Compare
    - name: Log and Compare Extract Evals Performance
      if: needs.determine-evals.outputs.run-extract == 'true'
      run: |
        experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json)
        dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json)
        echo "DomExtract Extract category score: $dom_score%"
        echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"

        experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json)
        text_score=$(jq '.categories.extract' eval-summary-extract-text.json)
        echo "TextExtract Extract category score: $text_score%"
        echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"

        # If domExtract <80% fail CI (the textExtract score is reported only).
        if (( $(echo "$dom_score < 80" | bc -l) )); then
          echo "DomExtract extract category score is below 80%. Failing CI."
          exit 1
        fi
# 4) Text-extract depends on extract. If no label: exit 0 and pass.
# Job (nests under the workflow's `jobs:` mapping). `exit 0` inside a step ends
# only that step, so the label gate is enforced with a step-level `if:` on
# every later step. A job-level `if:` is deliberately not used: a skipped job
# would also skip the jobs that `needs` it.
run-text-extract-evals:
  needs: [run-extract-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 120
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    # Log-only: the actual gating is done by the `if:` on the steps below.
    - name: Check for 'text-extract' label
      run: |
        if [ "${{ needs.determine-evals.outputs.run-text-extract }}" != "true" ]; then
          echo "No label for TEXT-EXTRACT. Exiting with success."
        fi

    - name: Set up Node.js
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      run: npm exec playwright install --with-deps

    - name: Build Stagehand
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      run: npm run build

    # 1. text_extract with textExtract first
    # Placeholder: writes a fixed summary file instead of running real evals.
    - name: Dummy text_extract Evals (textExtract)
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      run: |
        echo "Running dummy text_extract (textExtract)..."
        echo '{"experimentName":"dummyTextExtract","categories":{"text_extract":90}}' > eval-summary.json

    # Rename so the next run does not overwrite this summary.
    - name: Save text_extract Text Results
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      run: mv eval-summary.json eval-summary-text_extract-text.json

    # 2. Then domExtract
    - name: Dummy text_extract Evals (domExtract)
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      run: |
        echo "Running dummy text_extract (domExtract)..."
        echo '{"experimentName":"dummyDomExtract","categories":{"text_extract":88}}' > eval-summary.json

    - name: Save text_extract Dom Results
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      run: mv eval-summary.json eval-summary-text_extract-dom.json

    # 3. Log and Compare
    - name: Log and Compare text_extract Evals Performance
      if: needs.determine-evals.outputs.run-text-extract == 'true'
      run: |
        experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
        text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
        echo "TextExtract text_extract category score: $text_score%"
        echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"

        experimentNameDom=$(jq -r '.experimentName' eval-summary-text_extract-dom.json)
        dom_score=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json)
        echo "DomExtract text_extract category score: $dom_score%"
        echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"

        # If text_score <80% fail CI (the domExtract score is reported only).
        if (( $(echo "$text_score < 80" | bc -l) )); then
          echo "textExtract text_extract category score is below 80%. Failing CI."
          exit 1
        fi
# 5) Observe depends on text-extract. If no label: exit 0 and pass.
# Job (nests under the workflow's `jobs:` mapping). `exit 0` inside a step ends
# only that step, so the label gate is enforced with a step-level `if:` on
# every later step rather than by exiting the label-check step.
run-observe-evals:
  needs: [run-text-extract-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 25
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    # Log-only: the actual gating is done by the `if:` on the steps below.
    - name: Check for 'observe' label
      run: |
        if [ "${{ needs.determine-evals.outputs.run-observe }}" != "true" ]; then
          echo "No label for OBSERVE. Exiting with success."
        fi

    - name: Set up Node.js
      if: needs.determine-evals.outputs.run-observe == 'true'
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      if: needs.determine-evals.outputs.run-observe == 'true'
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      if: needs.determine-evals.outputs.run-observe == 'true'
      run: npm exec playwright install --with-deps

    - name: Build Stagehand
      if: needs.determine-evals.outputs.run-observe == 'true'
      run: npm run build

    # Placeholder: writes a fixed summary file instead of running real evals.
    - name: Dummy Observe Evals
      if: needs.determine-evals.outputs.run-observe == 'true'
      run: |
        echo "Running dummy observe evals..."
        echo '{"experimentName":"dummyObserve","categories":{"observe":85}}' > eval-summary.json

    - name: Log Observe Evals Performance
      if: needs.determine-evals.outputs.run-observe == 'true'
      run: |
        # Verify the summary exists BEFORE parsing it. The default shell runs
        # with `-e`, so a jq call on a missing file would abort the step
        # before the explanatory message below could ever be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for observe category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        observe_score=$(jq '.categories.observe' eval-summary.json)
        echo "Observe category score: $observe_score%"

        # bc prints 1 when the comparison holds; (( 1 )) is true in bash.
        if (( $(echo "$observe_score < 80" | bc -l) )); then
          echo "Observe category score is below 80%. Failing CI."
          exit 1
        fi