Skip to content

Commit

Permalink
wip
Browse files: browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent 23825f2 commit 05d9d97
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions provider/docs/src/corpus/source/web/webCorpusSource.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -62,12 +62,13 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo

const canonicalURLStr =
dom.window.document.querySelector<HTMLLinkElement>("head > link[rel='canonical']")?.href
if (canonicalURLStr) {
if (canonicalURLStr && canonicalURLStr !== url.href) {
const canonicalURL = new URL(canonicalURLStr)

// Only trust the canonical URL if it's same-origin, to avoid letting other
// sites pollute this corpus.
if (canonicalURL.origin === url.origin) {
logger?.(`- Found canonical URL: ${canonicalURL}`)
url = canonicalURL
if (!shouldCrawlURL(url)) {
continue
Expand All @@ -81,7 +82,9 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
url: url.toString(),
})

for (const link of dom.window.document.querySelectorAll<HTMLAnchorElement>('a[href]')) {
const pageLinks = dom.window.document.querySelectorAll<HTMLAnchorElement>('a[href]')
logger?.(`- Found ${pageLinks.length} links on page`)
for (const link of pageLinks) {
enqueueURL(new URL(link.href))
}
}
Expand Down

0 comments on commit 05d9d97

Please sign in to comment.