# Website links and spellcheck #912
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# Continuous Integration (CI) GitHub Actions workflow that checks websites for broken links using https://github.com/lycheeverse/lychee
# and spellchecks the downloaded pages with codespell.
# The link checker accepts the following status codes to reduce false positives:
#   - 401 (Vimeo, 'unauthorized')
#   - 403 (OpenVINO, 'forbidden')
#   - 429 (Instagram, 'too many requests')
#   - 500 (Zenodo, 'cached')
#   - 502 (Zenodo, 'bad gateway')
#   - 999 (LinkedIn, 'unknown status code')
name: Check Website links

# Runs on manual dispatch, on every pull request, and once a day on a schedule.
on:
  workflow_dispatch:
  pull_request:
  schedule:
    - cron: "0 0 * * *" # runs at 00:00 UTC every day

jobs:
  # One job per website in the matrix. Each job downloads the site's pages from its
  # sitemap, spellchecks them with codespell, checks their links with lychee, and
  # (on scheduled runs only) reports failures to Slack.
  Links:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false # This ensures that if one job fails, the others will still run
      matrix:
        website:
          - www.ultralytics.com
          - docs.ultralytics.com
          - community.ultralytics.com
          - handbook.ultralytics.com
    steps:
      # Resolve the latest lychee release asset for x86_64 Linux and unpack it into /usr/local/bin.
      - name: Download and install lychee
        run: |
          LYCHEE_URL=$(curl -s https://api.github.com/repos/lycheeverse/lychee/releases/latest | grep "browser_download_url" | grep "x86_64-unknown-linux-gnu.tar.gz" | cut -d '"' -f 4 | head -n 1)
          # Fail with a clear message when the API response contained no matching asset,
          # instead of letting curl/tar fail on an empty URL.
          if [ -z "$LYCHEE_URL" ]; then
            echo "Failed to resolve the lychee release download URL"
            exit 1
          fi
          # -f makes curl fail on HTTP errors rather than piping an error page into tar
          curl -fL "$LYCHEE_URL" | tar xz -C /usr/local/bin

      # Build urls.txt: every <loc> entry of the site's sitemap.xml, with any entry
      # whose URL contains "sitemap" treated as a sub-sitemap and expanded one level.
      - name: Get Website URLs
        run: |
          # Function to parse sitemap URLs: reads XML on stdin, prints one <loc> value per line
          parse_sitemap() {
            tr '\n' ' ' | sed 's/<loc>/\n<loc>/g' | grep -oP '(?<=<loc>).*?(?=</loc>)' || true
          }

          # Download initial sitemap and process
          echo "Downloading sitemap..."
          SITEMAP=$(wget -qO- "https://${{ matrix.website }}/sitemap.xml") || { echo "Failed to download sitemap"; exit 1; }
          echo "$SITEMAP" | parse_sitemap > urls.txt

          # Process any subsitemaps if they exist
          if grep -q 'sitemap' urls.txt; then
            echo "Found subsitemaps, processing..."
            grep 'sitemap' urls.txt > subsitemaps.txt
            # grep -v exits non-zero when every entry is a sub-sitemap; that is not an error here
            grep -v 'sitemap' urls.txt > urls.tmp || true
            while read -r submap; do
              echo "Processing submap: $submap"
              # A sub-sitemap that cannot be fetched is skipped rather than failing the job
              SUBMAP_CONTENT=$(wget -qO- "$submap") || { echo "Failed to download submap: $submap"; continue; }
              echo "$SUBMAP_CONTENT" | parse_sitemap >> urls.tmp
            done < subsitemaps.txt
            mv urls.tmp urls.txt || true
          fi

          # Count URLs
          total_urls=$(wc -l < urls.txt)
          echo "Total URLs to be downloaded: $total_urls"

      # Fetch every URL in urls.txt into a directory tree rooted at the host name
      # (--force-directories); later steps scan that tree.
      - name: Download Website
        run: |
          # Set higher wait seconds for discourse community to avoid 429 rate limit errors
          if [ "${{ matrix.website }}" = "community.ultralytics.com" ]; then
            WAIT=1
          else
            WAIT=0.001
          fi

          # Download all URLs
          wget \
            --adjust-extension \
            --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
            --input-file=urls.txt \
            --no-clobber \
            --no-parent \
            --wait="$WAIT" \
            --random-wait \
            --tries=3 \
            --no-verbose \
            --force-directories

      # Spellcheck the downloaded HTML. Publishes:
      #   env    CODESPELL_SUMMARY        - markdown summary for the GitHub step summary
      #   output CODESPELL_ERRORS         - the same findings, escaped for embedding in a JSON string
      #   output CODESPELL_FAILED         - 'true' when codespell reported at least one finding
      - name: Run codespell on downloaded pages
        id: codespell
        continue-on-error: true # Ensure the workflow continues even if spelling errors are found
        run: |
          pip install codespell
          CODESPELL_OUTPUT=$(find ${{ matrix.website }} -type f -name "*.html" -print0 | xargs -0 codespell \
            --ignore-words-list "crate,nd,ned,strack,dota,ane,segway,fo,gool,winn,commend,bloc,nam,afterall,skelton,goin,referer,pre,uint,dto,linkedin,webp,webgl,href,onclick,github,api,http,png,svg,gif,jpg,jpeg,href,js" \
            --skip "*.pt,*.pth,*.torchscript,*.onnx,*.tflite,*.pb,*.bin,*.param,*.mlmodel,*.engine,*.npy,*.data*,*.csv,*pnnx*,*venv*,*translat*,*lock*,__pycache__*,*.ico,*.jpg,*.png,*.mp4,*.mov,/runs,/.git,./docs/??/*.md,./docs/mkdocs_??.yml" \
            2>&1 || true)
          echo "$CODESPELL_OUTPUT"

          # Process CODESPELL_OUTPUT: drop "<file>:<line>:" from each finding, keeping "<dir>/ <finding>"
          MODIFIED_OUTPUT=$(echo "$CODESPELL_OUTPUT" | sed 's#\(.*/\)[^/]*:[0-9]*: \(.*\)#\1 \2#')
          echo "$MODIFIED_OUTPUT"

          # Check for spelling errors ("==>" separates a misspelling from its suggestion)
          if [[ "$CODESPELL_OUTPUT" == *"==>"* ]]; then
            echo "Spelling errors found ⚠️"
            # Multi-line environment variable for the step-summary step
            {
              echo "CODESPELL_SUMMARY<<EOF"
              echo "## 📝 Spelling Errors"
              # Use MODIFIED_OUTPUT here instead of CODESPELL_OUTPUT
              echo "$MODIFIED_OUTPUT"
              echo "EOF"
            } >> "$GITHUB_ENV"

            # Set output for Slack notification.
            # CODESPELL_SUMMARY above is only written to $GITHUB_ENV and is not a shell
            # variable in this step, so the output is built from MODIFIED_OUTPUT directly.
            # It is interpolated inside a JSON string by the Slack step, so backslashes
            # and double quotes are escaped and lines are joined with a literal "\n".
            ESCAPED_ERRORS=$(printf '%s\n' "$MODIFIED_OUTPUT" | sed 's/\\/\\\\/g; s/"/\\"/g' | awk '{printf "%s\\n", $0}')
            printf 'CODESPELL_ERRORS=%s\n' "$ESCAPED_ERRORS" >> "$GITHUB_OUTPUT"
            echo "CODESPELL_FAILED=true" >> "$GITHUB_OUTPUT"
          else
            echo "No spelling errors found ✅"
            echo "CODESPELL_FAILED=false" >> "$GITHUB_OUTPUT"
          fi

      # Run lychee over the downloaded HTML through the retry action. The script exits 1
      # when the lychee summary does not report zero errors. Publishes env SUMMARY
      # (escaped report) for the Slack step.
      - name: Run Broken Link Checks on Website
        id: lychee
        uses: ultralytics/actions/retry@main
        with:
          timeout_minutes: 60
          retry_delay_seconds: 900
          retries: 3
          run: |
            # Count successfully downloaded files
            downloaded_files=$(find ${{ matrix.website }} -type f | wc -l)
            echo "Scanning $downloaded_files downloaded pages for broken links..."

            # Create summary.txt with the total page count
            echo "*Results for $downloaded_files pages in https://${{ matrix.website }}*" > summary.txt
            echo "" >> summary.txt

            rm -rf .lycheecache
            lychee \
              --scheme 'https' \
              --timeout 60 \
              --insecure \
              --accept 401,403,429,500,502,999 \
              --exclude-all-private \
              --exclude 'https?://(www\.)?(linkedin\.com|twitter\.com|instagram\.com|kaggle\.com|tiktok\.com|fonts\.gstatic\.com|fonts\.googleapis\.com|url\.com|tesla\.com|wellfound\.com|.*\.cloudfunctions\.net|0\.0\.0\.0:5543/predict/from_files)' \
              --exclude-path '**/ci.yaml' \
              --github-token ${{ secrets.GITHUB_TOKEN }} \
              --header "User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.183 Safari/537.36" \
              --header "Accept=*/*" \
              --header "Accept-Language=*" \
              --header "Accept-Encoding=*" \
              './${{ matrix.website }}/**/*.html' | tee -a summary.txt

            # Add the summary to GitHub step summary
            cat summary.txt >> "$GITHUB_STEP_SUMMARY"

            # Prepare the summary for Slack (escape newlines, remove [], remove .html, and escape special characters)
            # Ignore lines starting with [TIMEOUT] on the next line or keep them in the following line
            # ESCAPED_SUMMARY=$(awk '!/^\[TIMEOUT\]/ {printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
            ESCAPED_SUMMARY=$(awk '{printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
            {
              echo "SUMMARY<<EOF"
              echo "$ESCAPED_SUMMARY"
              echo "EOF"
            } >> "$GITHUB_ENV"

            # Check if lychee found any broken links.
            # The zero must not be preceded by another digit, otherwise counts such as
            # "10 Errors" or "100 Errors" would be mistaken for success.
            if grep -qE '(^|[^0-9])0 Errors' summary.txt; then
              echo "No broken links found."
              exit 0
            else
              echo "Broken links found."
              exit 1
            fi

      - name: Add spelling errors to GitHub Summary
        if: always() && steps.codespell.outputs.CODESPELL_FAILED == 'true'
        run: |
          # Read the value from the environment instead of pasting it into the script with
          # an expression: the text originates from scraped pages and may contain quotes,
          # backticks or $(...) that the shell would otherwise interpret.
          echo "$CODESPELL_SUMMARY" >> "$GITHUB_STEP_SUMMARY"

      # Slack notifications are sent only for the first attempt of scheduled runs.
      - name: Notify Slack for broken links
        if: always() && steps.lychee.outcome == 'failure' && github.event_name == 'schedule' && github.run_attempt == '1'
        # NOTE(review): the version tag was garbled in the source this file was recovered
        # from; v2.0.0 is assumed because the step uses the v2 `webhook-type` input.
        # Confirm the intended pin.
        uses: slackapi/slack-github-action@v2.0.0
        with:
          webhook-type: incoming-webhook
          webhook: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}
          payload: |
            text: "GitHub Actions: Errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n${{ env.SUMMARY }}\n"

      - name: Notify Slack for spelling errors
        if: always() && steps.codespell.outputs.CODESPELL_FAILED == 'true' && github.event_name == 'schedule' && github.run_attempt == '1'
        # NOTE(review): version tag assumed, see the note on the previous Slack step.
        uses: slackapi/slack-github-action@v2.0.0
        with:
          webhook-type: incoming-webhook
          webhook: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}
          payload: |
            {
              "text": "GitHub Actions: Spelling errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n*Misspelled words:*\n${{ steps.codespell.outputs.CODESPELL_ERRORS }}\n"
            }