From e1871009f701f6519ae24c0ee97b7fbbe880c475 Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Tue, 26 Dec 2023 01:31:11 -0600 Subject: [PATCH] wip --- provider/docs/README.md | 1 + provider/docs/bin/create-web-corpus.ts | 8 ++-- provider/docs/src/corpus/index.ts | 16 ++++++- provider/docs/src/corpus/source/source.ts | 20 ++++++++- .../corpus/source/web/webCorpusSource.test.ts | 4 +- .../src/corpus/source/web/webCorpusSource.ts | 45 +++++++++++++------ provider/docs/src/provider/provider.ts | 32 ++++++++----- 7 files changed, 94 insertions(+), 32 deletions(-) diff --git a/provider/docs/README.md b/provider/docs/README.md index 26b6ac12..66cc349d 100644 --- a/provider/docs/README.md +++ b/provider/docs/README.md @@ -56,6 +56,7 @@ time p run -s docs-query 'making provider work in vscode' $(find ../../web/conte TODOs: +- simplify cache interface - deal with different content types (markdown/html) differently - make it slurp up gdocs/confluence/markdown in repos - show OCG annotations (but in a way that doesn't overlay lines in the file, is more passive?) diff --git a/provider/docs/bin/create-web-corpus.ts b/provider/docs/bin/create-web-corpus.ts index f45b4a6c..35a72fee 100644 --- a/provider/docs/bin/create-web-corpus.ts +++ b/provider/docs/bin/create-web-corpus.ts @@ -6,9 +6,10 @@ const args = process.argv.slice(2) const entryPage = args[0] const prefix = args[1] +const ignore = args.slice(2) -const USAGE = `\nUsage: ${path.basename(process.argv[1])} ` -if (!entryPage || !prefix || args.length !== 2) { +const USAGE = `\nUsage: ${path.basename(process.argv[1])} [ignore]` +if (!entryPage || !prefix || args.length < 2) { console.error('Error: invalid arguments') console.error(USAGE) process.exit(1) @@ -17,10 +18,11 @@ if (!entryPage || !prefix || args.length !== 2) { const corpusSource = createWebCorpusSource({ entryPage: new URL(entryPage), prefix: new URL(prefix), + ignore, logger: message => console.error('# ' + message), }) -const data = corpusData(await corpusSource.documents()) +const data = corpusData(await corpusSource.docs()) console.error(`# ${data.docs.length} docs`) console.log(JSON.stringify(data, null, 2)) diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts index dda55855..411797ee 100644 --- a/provider/docs/src/corpus/index.ts +++ b/provider/docs/src/corpus/index.ts @@ -13,6 +13,7 @@ export interface CorpusIndex { docs: IndexedDoc[] + doc(id: DocID): IndexedDoc search(query: string): Promise } @@ -61,7 +62,20 @@ export async function indexCorpus( indexedDocs.push({ doc, content, chunks }) } - const index: CorpusIndex = { data, docs: indexedDocs, search: query => multiSearch(index, query, cache) } + const index: CorpusIndex = { + data, + docs: indexedDocs, + doc(id) { + const doc = indexedDocs.find(d => d.doc.id === id) + if (!doc) { + throw new Error(`no document with id ${id} in corpus`) + } + return doc + }, + search(query) { + return multiSearch(index, query, cache) + }, + } return index } diff --git a/provider/docs/src/corpus/source/source.ts b/provider/docs/src/corpus/source/source.ts index 2eddd418..43dd91c0 100644 --- a/provider/docs/src/corpus/source/source.ts +++ b/provider/docs/src/corpus/source/source.ts @@ -1,5 +1,23 @@ +import { CorpusData } from '../data' import { Doc } from '../doc/doc' export interface CorpusSource { - documents(): Promise + docs(): Promise +} + +export function corpusDataSource(data: CorpusData | Promise): CorpusSource { + return { + docs: async () => (await data).docs, + } +} + +export function corpusDataURLSource(url: string): CorpusSource { + return corpusDataSource( + fetch(url).then(resp => { + if (!resp.ok) { + throw new Error(`failed to fetch corpus data from ${url}: ${resp.status} ${resp.statusText}`) + } + return resp.json() + }) + ) } diff --git a/provider/docs/src/corpus/source/web/webCorpusSource.test.ts b/provider/docs/src/corpus/source/web/webCorpusSource.test.ts index 6083da4e..2778b0e8 100644 --- a/provider/docs/src/corpus/source/web/webCorpusSource.test.ts +++ b/provider/docs/src/corpus/source/web/webCorpusSource.test.ts @@ -63,7 +63,7 @@ describe('createWebCorpusSource', () => { entryPage: new URL('https://example.com/docs/entry'), prefix: new URL('https://example.com/docs'), }) - expect(await source.documents()).toEqual([ + expect(await source.docs()).toEqual([ { id: 1, text: mockPages['/docs/entry'].body, url: 'https://example.com/docs/entry' }, { id: 2, text: mockPages['/docs/foo'].body, url: 'https://example.com/docs/foo' }, { id: 3, text: mockPages['/docs/bar'].body, url: 'https://example.com/docs/bar' }, @@ -90,7 +90,7 @@ describe('createWebCorpusSource', () => { entryPage: new URL('https://example.com/a/'), prefix: new URL('https://example.com'), }) - expect(await source.documents()).toMatchObject[]>([{ id: 1, url: 'https://example.com/a' }]) + expect(await source.docs()).toMatchObject[]>([{ id: 1, url: 'https://example.com/a' }]) }) }) diff --git a/provider/docs/src/corpus/source/web/webCorpusSource.ts b/provider/docs/src/corpus/source/web/webCorpusSource.ts index f238942c..535baf12 100644 --- a/provider/docs/src/corpus/source/web/webCorpusSource.ts +++ b/provider/docs/src/corpus/source/web/webCorpusSource.ts @@ -15,16 +15,23 @@ interface WebCorpusSourceOptions { */ prefix: URL + /** + * Exclude pages whose URL contains any of these strings. + */ + ignore?: string[] + /** * Called to print a log message. */ logger?: Logger } -export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSourceOptions): CorpusSource { +export function createWebCorpusSource({ entryPage, prefix, ignore, logger }: WebCorpusSourceOptions): CorpusSource { return { - documents: async () => { - const { nextURL, enqueueURL, shouldCrawlURL } = createCrawlQueue(url => urlHasPrefix(url, prefix)) + docs: async () => { + const { nextURL, enqueueURL, shouldCrawlURL } = createCrawlQueue( + url => urlHasPrefix(url, prefix) && !ignore?.some(ignore => url.href.includes(ignore)) + ) if (!shouldCrawlURL(entryPage)) { throw new Error(`web corpus entryPage (${entryPage}) does not start with prefix (${prefix})`) @@ -61,15 +68,16 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo const canonicalURLStr = dom.querySelector("head > link[rel='canonical']")?.href if (canonicalURLStr && canonicalURLStr !== url.href) { - const canonicalURL = new URL(canonicalURLStr) - - // Only trust the canonical URL if it's same-origin, to avoid letting other - // sites pollute this corpus. - if (canonicalURL.origin === url.origin) { - logger?.(`- Found canonical URL: ${canonicalURL}`) - url = canonicalURL - if (!shouldCrawlURL(url)) { - continue + const canonicalURL = parseURL(canonicalURLStr) + if (canonicalURL) { + // Only trust the canonical URL if it's same-origin, to avoid letting other + // sites pollute this corpus. + if (canonicalURL.origin === url.origin) { + logger?.(`- Found canonical URL: ${canonicalURL}`) + url = canonicalURL + if (!shouldCrawlURL(url)) { + continue + } } } } @@ -83,7 +91,10 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo const pageLinks = dom.querySelectorAll('a[href]') logger?.(`- Found ${pageLinks.length} links on page`) for (const link of pageLinks) { - enqueueURL(new URL(link.href)) + const linkURL = parseURL(link.href) + if (linkURL) { + enqueueURL(linkURL) + } } } @@ -92,6 +103,14 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo } } +function parseURL(urlStr: string): URL | undefined { + try { + return new URL(urlStr) + } catch { + return undefined + } +} + export function urlHasPrefix(url: URL, prefix: URL): boolean { // Disallow username and password. if (url.username || url.password) { diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index 90b2cd38..22978a16 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -9,13 +9,19 @@ import { indexCorpus } from '../corpus' import { createWebStorageCorpusCache } from '../corpus/cache/localStorage' import { corpusData } from '../corpus/data' import { extractContentUsingMozillaReadability } from '../corpus/doc/contentExtractor' +import { corpusDataURLSource } from '../corpus/source/source' import { createWebCorpusSource } from '../corpus/source/web/webCorpusSource' import { multiplex } from './multiplex' /** Settings for the docs OpenCodeGraph provider. */ export interface Settings { - entryPage: string - prefix: string + corpus: + | { url: string } + | { + entryPage: string + prefix: string + ignore?: string[] + } } const CORPUS_CACHE = @@ -26,14 +32,16 @@ const CORPUS_CACHE = * code from an existing documentation corpus. */ export default multiplex(async settings => { - const data = corpusData( - await createWebCorpusSource({ - entryPage: new URL(settings.entryPage), - prefix: new URL(settings.prefix), - logger: message => console.log(message), - }).documents() - ) - const index = await indexCorpus(data, { + const source = + 'url' in settings.corpus + ? corpusDataURLSource(settings.corpus.url) + : createWebCorpusSource({ + entryPage: new URL(settings.corpus.entryPage), + prefix: new URL(settings.corpus.prefix), + ignore: settings.corpus.ignore, + logger: message => console.log(message), + }) + const index = await indexCorpus(corpusData(await source.docs()), { cache: CORPUS_CACHE, contentExtractor: extractContentUsingMozillaReadability, }) @@ -50,10 +58,10 @@ export default multiplex(async settings => { const result: AnnotationsResult = { items: [], annotations: [] } for (const [i, sr] of searchResults.entries()) { - console.log(i, sr) + const doc = index.doc(sr.doc) const item: OpenCodeGraphItem = { id: i.toString(), - title: truncate(sr.excerpt, 50), + title: truncate(doc.content?.title ?? doc.doc.url ?? 'Untitled', 50), detail: sr.excerpt, } result.items.push(item)