-
-
Notifications
You must be signed in to change notification settings - Fork 2
199 lines (174 loc) · 9.09 KB
/
links.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# Continuous Integration (CI) GitHub Actions tests broken link checker using https://github.com/lycheeverse/lychee
# Ignores the following status codes to reduce false positives:
# - 401(Vimeo, 'unauthorized')
# - 403(OpenVINO, 'forbidden')
# - 429(Instagram, 'too many requests')
# - 500(Zenodo, 'cached')
# - 502(Zenodo, 'bad gateway')
# - 999(LinkedIn, 'unknown status code')
# Workflow identity, triggers, and the per-website job matrix
name: Check Website links
on:
  workflow_dispatch:
  pull_request:
  schedule:
    - cron: "0 0 * * *" # runs at 00:00 UTC every day
jobs:
  Links:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false # This ensures that if one job fails, the others will still run
      matrix:
        # One job is started per website listed here
        website:
          - www.ultralytics.com
          - docs.ultralytics.com
          - community.ultralytics.com
          - handbook.ultralytics.com
    steps:
- name: Download and install lychee
  run: |
    # Look up the x86_64 Linux tarball of the latest lychee release through the GitHub releases API.
    # `head -n 1` keeps a single URL should more than one asset name match the filter.
    LYCHEE_URL=$(curl -s https://api.github.com/repos/lycheeverse/lychee/releases/latest | grep "browser_download_url" | grep "x86_64-unknown-linux-gnu.tar.gz" | cut -d '"' -f 4 | head -n 1)
    # Fail with a clear message when no URL was found (for example when the API request was rejected),
    # instead of letting curl/tar fail on an empty argument.
    if [ -z "$LYCHEE_URL" ]; then
      echo "Failed to resolve the lychee download URL from the GitHub releases API"
      exit 1
    fi
    # Stream the archive straight into /usr/local/bin; -f makes curl report HTTP errors
    curl -fL "$LYCHEE_URL" | tar xz -C /usr/local/bin
- name: Get Website URLs
  run: |
    # Print the text of every <loc>...</loc> element found in the sitemap XML read from stdin
    parse_sitemap() {
      tr '\n' ' ' | sed 's/<loc>/\n<loc>/g' | grep -oP '(?<=<loc>).*?(?=</loc>)' || true
    }

    # Fetch the top-level sitemap; abort the step if it cannot be downloaded
    echo "Downloading sitemap..."
    if ! sitemap_xml=$(wget -qO- "https://${{ matrix.website }}/sitemap.xml"); then
      echo "Failed to download sitemap"
      exit 1
    fi
    echo "$sitemap_xml" | parse_sitemap > urls.txt

    # Entries containing 'sitemap' are treated as nested sitemaps and expanded in place
    if grep -q 'sitemap' urls.txt; then
      echo "Found subsitemaps, processing..."
      grep 'sitemap' urls.txt > subsitemaps.txt
      grep -v 'sitemap' urls.txt > urls.tmp || true
      while read -r submap; do
        echo "Processing submap: $submap"
        # A failed sub-sitemap download is reported and skipped rather than failing the step
        if ! submap_xml=$(wget -qO- "$submap"); then
          echo "Failed to download submap: $submap"
          continue
        fi
        echo "$submap_xml" | parse_sitemap >> urls.tmp
      done < subsitemaps.txt
      mv urls.tmp urls.txt || true
    fi

    # Report how many URLs were collected
    url_count=$(wc -l < urls.txt)
    echo "Total URLs to be downloaded: $url_count"
- name: Download Website
  run: |
    # Pick the delay between requests: the discourse community site gets a
    # longer wait to avoid 429 rate limit errors, every other site a minimal one
    case "${{ matrix.website }}" in
      community.ultralytics.com) wait_seconds=1 ;;
      *) wait_seconds=0.001 ;;
    esac

    # Fetch every URL listed in urls.txt into a directory tree named after the host,
    # skipping image and .txt files
    wget \
      --input-file=urls.txt \
      --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
      --adjust-extension \
      --force-directories \
      --no-clobber \
      --no-parent \
      --wait="$wait_seconds" \
      --random-wait \
      --tries=3 \
      --no-verbose
# Spell-check the downloaded HTML pages with codespell.
# Step outputs:
#   CODESPELL_FAILED - 'true' when codespell reported at least one finding, else 'false'
#   CODESPELL_ERRORS - findings as one line, newlines and quotes escaped for the Slack JSON payload
# Environment (for later steps):
#   CODESPELL_SUMMARY - Markdown heading plus the findings, for the GitHub step summary
- name: Run codespell on downloaded pages
  id: codespell
  continue-on-error: true # Ensure the workflow continues even if spelling errors are found
  run: |
    pip install codespell
    CODESPELL_OUTPUT=$(find ${{ matrix.website }} -type f -name "*.html" -print0 | xargs -0 codespell \
      --ignore-words-list "crate,nd,ned,strack,dota,ane,segway,fo,gool,winn,commend,bloc,nam,afterall,skelton,goin,referer,pre,uint,dto,linkedin,webp,webgl,href,onclick,github,api,http,png,svg,gif,jpg,jpeg,href,js" \
      --skip "*.pt,*.pth,*.torchscript,*.onnx,*.tflite,*.pb,*.bin,*.param,*.mlmodel,*.engine,*.npy,*.data*,*.csv,*pnnx*,*venv*,*translat*,*lock*,__pycache__*,*.ico,*.jpg,*.png,*.mp4,*.mov,/runs,/.git,./docs/??/*.md,./docs/mkdocs_??.yml" \
      2>&1 || true)
    echo "$CODESPELL_OUTPUT"
    # Drop the file name and line number from each finding, keeping the directory and the suggestion
    MODIFIED_OUTPUT=$(echo "$CODESPELL_OUTPUT" | sed 's#\(.*/\)[^/]*:[0-9]*: \(.*\)#\1 \2#')
    echo "$MODIFIED_OUTPUT"
    # codespell prints "word ==> suggestion" for every finding
    if [[ "$CODESPELL_OUTPUT" == *"==>"* ]]; then
      echo "Spelling errors found ⚠️"
      # Multi-line value for later steps (GitHub step summary)
      {
        echo "CODESPELL_SUMMARY<<EOF"
        echo "## 📝 Spelling Errors"
        echo "$MODIFIED_OUTPUT"
        echo "EOF"
      } >> "$GITHUB_ENV"
      # Set output for Slack notification.
      # Values written to GITHUB_ENV are not visible in the step that writes them, so the
      # output is built from MODIFIED_OUTPUT directly. Backslashes and double quotes are
      # escaped and lines are joined with a literal "\n" so the value is a single line that
      # can be embedded in the JSON payload.
      ESCAPED_ERRORS=$(printf '%s\n' "$MODIFIED_OUTPUT" | sed 's/\\/\\\\/g; s/"/\\"/g' | awk '{printf "%s\\n", $0}')
      echo "CODESPELL_ERRORS=$ESCAPED_ERRORS" >> "$GITHUB_OUTPUT"
      echo "CODESPELL_FAILED=true" >> "$GITHUB_OUTPUT"
    else
      echo "No spelling errors found ✅"
      echo "CODESPELL_FAILED=false" >> "$GITHUB_OUTPUT"
    fi
# Run lychee over the downloaded pages through the retry action.
# The script writes summary.txt, appends it to the GitHub step summary, stores an escaped
# single-line copy in the SUMMARY environment variable for the Slack notification, and
# exits non-zero unless lychee reports exactly zero errors.
- name: Run Broken Link Checks on Website
  id: lychee
  uses: ultralytics/actions/retry@main
  with:
    timeout_minutes: 60
    retry_delay_seconds: 900
    retries: 3
    run: |
      # Count successfully downloaded files
      downloaded_files=$(find ${{ matrix.website }} -type f | wc -l)
      echo "Scanning $downloaded_files downloaded pages for broken links..."
      # Create summary.txt with the total page count
      echo "*Results for $downloaded_files pages in https://${{ matrix.website }}*" > summary.txt
      echo "" >> summary.txt
      rm -rf .lycheecache
      lychee \
        --scheme 'https' \
        --timeout 60 \
        --insecure \
        --accept 401,403,429,500,502,999 \
        --exclude-all-private \
        --exclude 'https?://(www\.)?(linkedin\.com|twitter\.com|instagram\.com|kaggle\.com|tiktok\.com|fonts\.gstatic\.com|fonts\.googleapis\.com|url\.com|tesla\.com|wellfound\.com|.*\.cloudfunctions\.net|0\.0\.0\.0:5543/predict/from_files)' \
        --exclude-path '**/ci.yaml' \
        --github-token ${{ secrets.GITHUB_TOKEN }} \
        --header "User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.183 Safari/537.36" \
        --header "Accept=*/*" \
        --header "Accept-Language=*" \
        --header "Accept-Encoding=*" \
        './${{ matrix.website }}/**/*.html' | tee -a summary.txt
      # Add the summary to GitHub step summary
      cat summary.txt >> "$GITHUB_STEP_SUMMARY"
      # Prepare the summary for Slack (escape newlines, remove [], remove .html, and escape special characters)
      # Ignore lines starting with [TIMEOUT] on the next line or keep them in the following line
      # ESCAPED_SUMMARY=$(awk '!/^\[TIMEOUT\]/ {printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
      ESCAPED_SUMMARY=$(awk '{printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
      {
        echo "SUMMARY<<EOF"
        echo "$ESCAPED_SUMMARY"
        echo "EOF"
      } >> "$GITHUB_ENV"
      # Check if lychee found any broken links.
      # The piped `tee` hides lychee's exit status, so the result is read from the summary.
      # The count must be exactly 0: a plain "0 Errors" substring match would also accept
      # "10 Errors", "20 Errors", "100 Errors", and so on.
      if grep -qE '(^|[^0-9])0 Errors' summary.txt; then
        echo "No broken links found."
        exit 0
      else
        echo "Broken links found."
        exit 1
      fi
# Append the spelling findings to the job summary when the codespell step reported any.
- name: Add spelling errors to GitHub Summary
  if: always() && steps.codespell.outputs.CODESPELL_FAILED == 'true'
  run: |
    # CODESPELL_SUMMARY was written to GITHUB_ENV by the codespell step, so it is available
    # here as an ordinary environment variable. Reading it through the shell, instead of
    # pasting it into the script with an expression, keeps text taken from crawled pages
    # (quotes, backticks, $(...)) from being interpreted as shell code.
    echo "$CODESPELL_SUMMARY" >> "$GITHUB_STEP_SUMMARY"
# Post the lychee summary to Slack, only for the first attempt of a scheduled run whose
# link check failed. The www site uses its own webhook; every other site shares one.
- name: Notify Slack for broken links
  if: always() && steps.lychee.outcome == 'failure' && github.event_name == 'schedule' && github.run_attempt == '1'
  # NOTE(review): the action reference was garbled in this copy of the file
  # ("slackapi/[email protected]"). Restored as slack-github-action@v2.0.0 because the
  # `webhook-type` input below belongs to the v2 interface - confirm the exact pinned version.
  uses: slackapi/slack-github-action@v2.0.0
  with:
    webhook-type: incoming-webhook
    webhook: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}
    # env.SUMMARY is the escaped single-line summary stored by the lychee step
    payload: |
      text: "GitHub Actions: Errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n${{ env.SUMMARY }}\n"
# Post the codespell findings to Slack, only for the first attempt of a scheduled run in
# which the codespell step reported spelling errors. The www site uses its own webhook;
# every other site shares one.
- name: Notify Slack for spelling errors
  if: always() && steps.codespell.outputs.CODESPELL_FAILED == 'true' && github.event_name == 'schedule' && github.run_attempt == '1'
  # NOTE(review): the action reference was garbled in this copy of the file
  # ("slackapi/[email protected]"). Restored as slack-github-action@v2.0.0 because the
  # `webhook-type` input below belongs to the v2 interface - confirm the exact pinned version.
  uses: slackapi/slack-github-action@v2.0.0
  with:
    webhook-type: incoming-webhook
    webhook: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}
    # The misspelled-word list comes from the CODESPELL_ERRORS output of the codespell step
    payload: |
      {
        "text": "GitHub Actions: Spelling errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n*Misspelled words:*\n${{ steps.codespell.outputs.CODESPELL_ERRORS }}\n"
      }