Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent d573fba commit ae6a54d
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 25 deletions.
21 changes: 1 addition & 20 deletions provider/docs/src/corpus/doc/contentExtractor.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Readability } from '@mozilla/readability'
import { parseDOM } from '../../dom'
import { type Doc } from './doc'

export interface Content {
Expand Down Expand Up @@ -28,26 +29,6 @@ export interface ContentExtractor {
export const extractContentUsingMozillaReadability: ContentExtractor = {
id: 'mozillaReadability',
async extractContent(doc) {
type ParseDOM = (html: string, url: string | undefined) => Promise<Document>
const parseDOM: ParseDOM =
typeof DOMParser === 'undefined'
? async (html, url) => {
const { JSDOM } = await import('jsdom')
return new JSDOM(html, { url }).window.document
}
: (html, url) => {
const document = new DOMParser().parseFromString(html, 'text/html')

// Set base URL.
if (url && document.head.querySelectorAll('base').length === 0) {
const baseEl = document.createElement('base')
baseEl.setAttribute('href', url)
document.head.append(baseEl)
}

return Promise.resolve(document)
}

const info = new Readability(await parseDOM(doc.text, doc.url), {
charThreshold: 500,
}).parse()
Expand Down
9 changes: 4 additions & 5 deletions provider/docs/src/corpus/source/web/webCorpusSource.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { JSDOM } from 'jsdom'
import { parseDOM } from '../../../dom'
import { Logger } from '../../../logger'
import { Doc } from '../../doc/doc'
import { CorpusSource } from '../source'
Expand Down Expand Up @@ -57,10 +57,9 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
}

const html = await resp.text()
const dom = new JSDOM(html, { url: resp.url })
const dom = await parseDOM(html, resp.url)

const canonicalURLStr =
dom.window.document.querySelector<HTMLLinkElement>("head > link[rel='canonical']")?.href
const canonicalURLStr = dom.querySelector<HTMLLinkElement>("head > link[rel='canonical']")?.href
if (canonicalURLStr && canonicalURLStr !== url.href) {
const canonicalURL = new URL(canonicalURLStr)

Expand All @@ -81,7 +80,7 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
url: url.toString(),
})

const pageLinks = dom.window.document.querySelectorAll<HTMLAnchorElement>('a[href]')
const pageLinks = dom.querySelectorAll<HTMLAnchorElement>('a[href]')
logger?.(`- Found ${pageLinks.length} links on page`)
for (const link of pageLinks) {
enqueueURL(new URL(link.href))
Expand Down
23 changes: 23 additions & 0 deletions provider/docs/src/dom.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
 * Signature of an HTML-to-`Document` parser.
 *
 * @param html The HTML source text to parse.
 * @param url The URL the HTML was loaded from, used as the base for resolving relative URLs in the
 * document (such as `a[href]` or `link[href]`). Pass `undefined` if it is not known.
 * @returns A promise for the parsed document.
 */
export type ParseDOM = (html: string, url: string | undefined) => Promise<Document>

/**
 * Parse DOM (works in both Node and browser).
 *
 * When a global `DOMParser` exists it is used; otherwise `jsdom` is imported lazily, so it is only
 * loaded when that path is actually taken. The environment check happens on each call, so a
 * `DOMParser` that is installed after this module was loaded is still picked up.
 *
 * Any failure while parsing is reported as a rejection of the returned promise, never as a
 * synchronous throw.
 */
export const parseDOM: ParseDOM = async (html, url) => {
    if (typeof DOMParser === 'undefined') {
        const { JSDOM } = await import('jsdom')
        return new JSDOM(html, { url }).window.document
    }

    const document = new DOMParser().parseFromString(html, 'text/html')

    // Set base URL. `DOMParser` has no way to be told the document's URL, so inject a `<base href>`
    // instead. Only a `<base>` element that has an `href` attribute establishes a base URL (a bare
    // `<base target="...">` does not), and it does so wherever it is in the document, so that is
    // the condition to check before adding our own.
    if (url && document.querySelector('base[href]') === null) {
        const baseEl = document.createElement('base')
        baseEl.setAttribute('href', url)
        document.head.append(baseEl)
    }

    return document
}

0 comments on commit ae6a54d

Please sign in to comment.