From ae6a54d72fa3cedc9bf2e7134bfd3783a66c5c3f Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Tue, 26 Dec 2023 01:08:21 -0600 Subject: [PATCH] wip --- .../docs/src/corpus/doc/contentExtractor.ts | 21 +---------------- .../src/corpus/source/web/webCorpusSource.ts | 9 ++++---- provider/docs/src/dom.ts | 23 +++++++++++++++++++ 3 files changed, 28 insertions(+), 25 deletions(-) create mode 100644 provider/docs/src/dom.ts diff --git a/provider/docs/src/corpus/doc/contentExtractor.ts b/provider/docs/src/corpus/doc/contentExtractor.ts index 7b09a52c..1b439099 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.ts @@ -1,4 +1,5 @@ import { Readability } from '@mozilla/readability' +import { parseDOM } from '../../dom' import { type Doc } from './doc' export interface Content { @@ -28,26 +29,6 @@ export interface ContentExtractor { export const extractContentUsingMozillaReadability: ContentExtractor = { id: 'mozillaReadability', async extractContent(doc) { - type ParseDOM = (html: string, url: string | undefined) => Promise - const parseDOM: ParseDOM = - typeof DOMParser === 'undefined' - ? async (html, url) => { - const { JSDOM } = await import('jsdom') - return new JSDOM(html, { url }).window.document - } - : (html, url) => { - const document = new DOMParser().parseFromString(html, 'text/html') - - // Set base URL. - if (url && document.head.querySelectorAll('base').length === 0) { - const baseEl = document.createElement('base') - baseEl.setAttribute('href', url) - document.head.append(baseEl) - } - - return Promise.resolve(document) - } - const info = new Readability(await parseDOM(doc.text, doc.url), { charThreshold: 500, }).parse() diff --git a/provider/docs/src/corpus/source/web/webCorpusSource.ts b/provider/docs/src/corpus/source/web/webCorpusSource.ts index c2d44c79..f238942c 100644 --- a/provider/docs/src/corpus/source/web/webCorpusSource.ts +++ b/provider/docs/src/corpus/source/web/webCorpusSource.ts @@ -1,4 +1,4 @@ -import { JSDOM } from 'jsdom' +import { parseDOM } from '../../../dom' import { Logger } from '../../../logger' import { Doc } from '../../doc/doc' import { CorpusSource } from '../source' @@ -57,10 +57,9 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo } const html = await resp.text() - const dom = new JSDOM(html, { url: resp.url }) + const dom = await parseDOM(html, resp.url) - const canonicalURLStr = - dom.window.document.querySelector("head > link[rel='canonical']")?.href + const canonicalURLStr = dom.querySelector("head > link[rel='canonical']")?.href if (canonicalURLStr && canonicalURLStr !== url.href) { const canonicalURL = new URL(canonicalURLStr) @@ -81,7 +80,7 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo url: url.toString(), }) - const pageLinks = dom.window.document.querySelectorAll('a[href]') + const pageLinks = dom.querySelectorAll('a[href]') logger?.(`- Found ${pageLinks.length} links on page`) for (const link of pageLinks) { enqueueURL(new URL(link.href)) diff --git a/provider/docs/src/dom.ts b/provider/docs/src/dom.ts new file mode 100644 index 00000000..95efcebb --- /dev/null +++ b/provider/docs/src/dom.ts @@ -0,0 +1,23 @@ +export type ParseDOM = (html: string, url: string | undefined) => Promise + +/** + * Parse DOM (works in both Node and browser). + */ +export const parseDOM: ParseDOM = + typeof DOMParser === 'undefined' + ? async (html, url) => { + const { JSDOM } = await import('jsdom') + return new JSDOM(html, { url }).window.document + } + : (html, url) => { + const document = new DOMParser().parseFromString(html, 'text/html') + + // Set base URL. + if (url && document.head.querySelectorAll('base').length === 0) { + const baseEl = document.createElement('base') + baseEl.setAttribute('href', url) + document.head.append(baseEl) + } + + return Promise.resolve(document) + }