Skip to content

Add spellchecks to actions #904

Add spellchecks to actions

Add spellchecks to actions #904

Workflow file for this run

# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# Continuous Integration (CI) GitHub Actions workflow that checks websites for broken links using https://github.com/lycheeverse/lychee
# Ignores the following status codes to reduce false positives:
# - 401(Vimeo, 'unauthorized')
# - 403(OpenVINO, 'forbidden')
# - 429(Instagram, 'too many requests')
# - 500(Zenodo, 'cached')
# - 502(Zenodo, 'bad gateway')
# - 999(LinkedIn, 'unknown status code')
name: Check Website links

# Triggers: manual dispatch, pull requests, and a nightly schedule
on:
  workflow_dispatch:
  pull_request:
  schedule:
    - cron: "0 0 * * *" # runs at 00:00 UTC every day

jobs:
  Links:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false # This ensures that if one job fails, the others will still run
      matrix:
        # One job is spawned per website listed here
        website:
          - www.ultralytics.com
          - docs.ultralytics.com
          - handbook.ultralytics.com
    steps:
# Install the latest lychee release binary into /usr/local/bin
- name: Download and install lychee
  run: |
    # Resolve the x86_64 Linux (gnu) tarball URL of the latest release via the GitHub API
    LYCHEE_URL=$(curl -s https://api.github.com/repos/lycheeverse/lychee/releases/latest | grep "browser_download_url" | grep "x86_64-unknown-linux-gnu.tar.gz" | cut -d '"' -f 4 | head -n 1)
    # An empty value means the API call failed (e.g. rate limiting) or no asset matched
    if [ -z "$LYCHEE_URL" ]; then
      echo "Failed to resolve lychee download URL"
      exit 1
    fi
    # -f makes curl fail on HTTP errors instead of piping an error page into tar
    curl -fL "$LYCHEE_URL" | tar xz -C /usr/local/bin
# Build urls.txt: one page URL per line, taken from the site's sitemap (and any nested sitemaps)
- name: Get Website URLs
  run: |
    # Print the contents of every <loc>...</loc> element found in the XML read from stdin
    parse_sitemap() {
      tr '\n' ' ' | sed 's/<loc>/\n<loc>/g' | grep -oP '(?<=<loc>).*?(?=</loc>)' || true
    }
    # Fetch the root sitemap; the step aborts if it cannot be downloaded
    echo "Downloading sitemap..."
    if ! root_xml=$(wget -qO- "https://${{ matrix.website }}/sitemap.xml"); then
      echo "Failed to download sitemap"
      exit 1
    fi
    echo "$root_xml" | parse_sitemap > urls.txt
    # Any entry containing 'sitemap' is treated as a nested sitemap and expanded one level
    if grep -q 'sitemap' urls.txt; then
      echo "Found subsitemaps, processing..."
      grep 'sitemap' urls.txt > subsitemaps.txt
      grep -v 'sitemap' urls.txt > urls.tmp || true
      while read -r sub_url; do
        echo "Processing submap: $sub_url"
        # A nested sitemap that fails to download is skipped rather than failing the step
        if ! sub_xml=$(wget -qO- "$sub_url"); then
          echo "Failed to download submap: $sub_url"
          continue
        fi
        echo "$sub_xml" | parse_sitemap >> urls.tmp
      done < subsitemaps.txt
      mv urls.tmp urls.txt || true
    fi
    # Report how many URLs were collected
    url_count=$(wc -l < urls.txt)
    echo "Total URLs to be downloaded: $url_count"
# Mirror every URL listed in urls.txt into a directory tree named after the host
- name: Download Website
  run: |
    # The discourse community site gets a longer wait between requests to avoid 429 rate limit errors
    case "${{ matrix.website }}" in
      community.ultralytics.com) WAIT=1 ;;
      *) WAIT=0.001 ;;
    esac
    # Fetch all pages; image assets and .txt files are rejected
    wget \
      --input-file=urls.txt \
      --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
      --adjust-extension \
      --force-directories \
      --no-clobber \
      --no-parent \
      --wait=$WAIT \
      --random-wait \
      --tries=3 \
      --no-verbose
# Spell-check the downloaded HTML pages. The step exits 1 when codespell reports anything;
# continue-on-error lets the job carry on to the link check afterwards.
- name: Run codespell on downloaded pages
  id: codespell
  continue-on-error: true
  run: |
    pip install codespell
    # -r stops xargs from invoking codespell with no file arguments when no HTML pages were downloaded
    CODESPELL_OUTPUT=$(find ${{ matrix.website }} -type f -name "*.html" -print0 | xargs -0 -r codespell \
      --ignore-words-list "Hache,Calle,ore,COO,MOT,crate,nd,ned,strack,dota,ane,segway,fo,gool,winn,commend,bloc,nam,afterall,skelton,goin,referer,pre,uint,dto,linkedin,webp,webgl,href,onclick,github,api,http,png,svg,gif,jpg,jpeg,js" \
      --skip "*.pt,*.pth,*.torchscript,*.onnx,*.tflite,*.pb,*.bin,*.param,*.mlmodel,*.engine,*.npy,*.data*,*.csv,*pnnx*,*venv*,*translat*,*lock*,__pycache__*,*.ico,*.jpg,*.png,*.mp4,*.mov,/runs,/.git,./docs/??/*.md,./docs/mkdocs_??.yml" \
      2>&1 || true)
    # Process CODESPELL_OUTPUT: drop the line number, the .html suffix and a trailing /index
    MODIFIED_OUTPUT=$(echo "$CODESPELL_OUTPUT" | sed 's#\(.*\):[0-9]*: \(.*\)#\1 \2#; s/\.html//; s#\(.*\)/index #\1/ #')
    # Prepare the output for Slack: escape backslashes and double quotes so the JSON payload
    # stays valid, then join the lines with explicit \n sequences
    ESCAPED_OUTPUT=$(echo "$MODIFIED_OUTPUT" | sed 's/\\/\\\\/g; s/"/\\"/g' | awk '{printf "%s\\n", $0}')
    # Export as a multi-line environment variable for the Slack notification step
    echo "CODESPELL_ERRORS<<EOF" >> "$GITHUB_ENV"
    echo "$ESCAPED_OUTPUT" >> "$GITHUB_ENV"
    echo "EOF" >> "$GITHUB_ENV"
    echo "$MODIFIED_OUTPUT"
    # codespell reports each finding as "wrong ==> suggestion", so "==>" marks at least one error
    if [[ "$CODESPELL_OUTPUT" == *"==>"* ]]; then
      echo "Spelling errors found ⚠️"
      echo "## 📝 Spelling Errors" >> "$GITHUB_STEP_SUMMARY"
      echo "$MODIFIED_OUTPUT" >> "$GITHUB_STEP_SUMMARY"
      exit 1
    fi
    echo "No spelling errors found ✅"
# Run lychee over the downloaded HTML pages through the ultralytics retry action.
# The script exits 1 unless lychee's summary reports exactly zero errors.
- name: Run Broken Link Checks on Website
  id: lychee
  uses: ultralytics/actions/retry@main
  with:
    timeout_minutes: 60
    retry_delay_seconds: 900
    retries: 3
    run: |
      # Count successfully downloaded files
      downloaded_files=$(find ${{ matrix.website }} -type f | wc -l)
      echo "Scanning $downloaded_files downloaded pages for broken links..."
      # Create summary.txt with the total page count
      echo "*Results for $downloaded_files pages in https://${{ matrix.website }}*" > summary.txt
      echo "" >> summary.txt
      # Start from a clean cache
      rm -rf .lycheecache
      lychee \
        --scheme 'https' \
        --timeout 60 \
        --insecure \
        --accept 401,403,429,500,502,999 \
        --exclude-all-private \
        --exclude 'https?://(www\.)?(linkedin\.com|twitter\.com|instagram\.com|kaggle\.com|tiktok\.com|fonts\.gstatic\.com|fonts\.googleapis\.com|url\.com|tesla\.com|wellfound\.com|.*\.cloudfunctions\.net|0\.0\.0\.0:5543/predict/from_files)' \
        --exclude-path '**/ci.yaml' \
        --github-token ${{ secrets.GITHUB_TOKEN }} \
        --header "User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.183 Safari/537.36" \
        --header "Accept=*/*" \
        --header "Accept-Language=*" \
        --header "Accept-Encoding=*" \
        './${{ matrix.website }}/**/*.html' | tee -a summary.txt
      # Add the summary to GitHub step summary
      cat summary.txt >> "$GITHUB_STEP_SUMMARY"
      # Prepare the summary for Slack (escape newlines, remove [], remove .html, and escape double quotes)
      # To drop lines starting with [TIMEOUT], use the commented-out variant below instead
      # ESCAPED_SUMMARY=$(awk '!/^\[TIMEOUT\]/ {printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
      ESCAPED_SUMMARY=$(awk '{printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
      # Export as a multi-line environment variable for the Slack notification step
      echo "SUMMARY<<EOF" >> "$GITHUB_ENV"
      echo "$ESCAPED_SUMMARY" >> "$GITHUB_ENV"
      echo "EOF" >> "$GITHUB_ENV"
      # Check if lychee found any broken links. The count must be exactly 0: the
      # non-digit guard keeps "10 Errors" or "120 Errors" from matching as "0 Errors".
      if grep -Eq '(^|[^0-9])0 Errors' summary.txt; then
        echo "No broken links found."
      else
        echo "Broken links found."
        exit 1
      fi
# Post the lychee summary to Slack. Runs only when the link check failed on the first
# attempt of a scheduled run; www.ultralytics.com uses its own webhook.
- name: Notify Slack for broken links
  if: always() && steps.lychee.outcome == 'failure' && github.event_name == 'schedule' && github.run_attempt == '1'
  # NOTE(review): the action reference was mangled by e-mail obfuscation in the source;
  # restored to the v2 action that the `webhook-type` input requires. Confirm the pinned version.
  uses: slackapi/slack-github-action@v2.0.0
  with:
    webhook-type: incoming-webhook
    webhook: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}
    payload: |
      text: "GitHub Actions: Errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n${{ env.SUMMARY }}\n"
# Post the codespell findings to Slack. Runs only when the codespell step failed on the
# first attempt of a pull_request run; www.ultralytics.com uses its own webhook.
- name: Notify Slack for spelling errors
  if: always() && steps.codespell.outcome == 'failure' && github.event_name == 'pull_request' && github.run_attempt == '1'
  # NOTE(review): the action reference was mangled by e-mail obfuscation in the source;
  # restored to the v2 action that the `webhook-type` input requires. Confirm the pinned version.
  uses: slackapi/slack-github-action@v2.0.0
  with:
    webhook-type: incoming-webhook
    webhook: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}
    payload: |
      {
        "text": "GitHub Actions: Spelling errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n*📝 Spelling Errors:*\n${{ env.CODESPELL_ERRORS }}\n"
      }