From d573fbae5a63e9f87751d3cab0d0a83c8f24cb7e Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Tue, 26 Dec 2023 01:05:58 -0600 Subject: [PATCH] wip --- client/web-playground/src/demo/settings.ts | 6 ++-- client/web-playground/vite.config.ts | 1 + lib/client/src/api.ts | 1 + pnpm-lock.yaml | 10 ++++++ provider/docs/README.md | 2 +- provider/docs/package.json | 1 + .../src/corpus/doc/contentExtractor.test.ts | 4 +-- .../docs/src/corpus/doc/contentExtractor.ts | 27 ++++++++++++--- provider/docs/src/corpus/index.ts | 4 +-- provider/docs/src/provider/multiplex.ts | 26 +++++++++++++-- provider/docs/src/provider/provider.ts | 33 ++++++++++++------- 11 files changed, 88 insertions(+), 27 deletions(-) diff --git a/client/web-playground/src/demo/settings.ts b/client/web-playground/src/demo/settings.ts index b30e8117..5326111f 100644 --- a/client/web-playground/src/demo/settings.ts +++ b/client/web-playground/src/demo/settings.ts @@ -3,8 +3,10 @@ import { type ProviderSettings } from '@opencodegraph/client' async function getProviders(): Promise> { const providerSettings: Record = { '../../../../provider/hello-world/index.ts': false, - '../../../../provider/docs/src/provider/provider.ts': - {} satisfies import('@opencodegraph/provider-docs').Settings, + '../../../../provider/docs/src/provider/provider.ts': { + entryPage: 'http://localhost:5800/docs/start', + prefix: 'http://localhost:5800/docs', + } satisfies import('@opencodegraph/provider-docs').Settings, '../../../../provider/links/index.ts': { links: [ { diff --git a/client/web-playground/vite.config.ts b/client/web-playground/vite.config.ts index ffc1c859..3c66f7b1 100644 --- a/client/web-playground/vite.config.ts +++ b/client/web-playground/vite.config.ts @@ -19,6 +19,7 @@ export default defineConfig(({ mode }) => ({ ] : [], }, + define: {}, css: { devSourcemap: true, modules: { diff --git a/lib/client/src/api.ts b/lib/client/src/api.ts index 23f4f538..f7304149 100644 --- a/lib/client/src/api.ts +++ b/lib/client/src/api.ts @@ -56,6 +56,7 @@ export function observeAnnotations( emitPartial ? startWith(null) : tap(), catchError(error => { logger?.(`failed to get annotations: ${error}`) + console.error(error) return of(null) }) ) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index dc49fe7b..cf99c712 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -482,6 +482,9 @@ importers: '@xenova/transformers': specifier: ^2.12.1 version: 2.12.1 + buffer: + specifier: ^6.0.3 + version: 6.0.3 env-paths: specifier: ^3.0.0 version: 3.0.0 @@ -8192,6 +8195,13 @@ packages: base64-js: 1.5.1 ieee754: 1.2.1 + /buffer@6.0.3: + resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==} + dependencies: + base64-js: 1.5.1 + ieee754: 1.2.1 + dev: false + /builtin-modules@3.3.0: resolution: {integrity: sha512-zhaCDicdLuWN5UbN5IMnFqNMhNfo919sH85y2/ea+5Yg9TsTkeZxpL+JLbp6cgYFS4sRLp3YV4S6yDuqVWHYOw==} engines: {node: '>=6'} diff --git a/provider/docs/README.md b/provider/docs/README.md index 3cb595cd..26b6ac12 100644 --- a/provider/docs/README.md +++ b/provider/docs/README.md @@ -56,7 +56,7 @@ time p run -s docs-query 'making provider work in vscode' $(find ../../web/conte TODOs: -- make it slurp up a base URL of docs +- deal with different content types (markdown/html) differently - make it slurp up gdocs/confluence/markdown in repos - show OCG annotations (but in a way that doesn't overlay lines in the file, is more passive?) - show a demo of Cody working with this diff --git a/provider/docs/package.json b/provider/docs/package.json index 168f0adb..431c8b72 100644 --- a/provider/docs/package.json +++ b/provider/docs/package.json @@ -28,6 +28,7 @@ "@mozilla/readability": "^0.5.0", "@opencodegraph/provider": "workspace:*", "@xenova/transformers": "^2.12.1", + "buffer": "^6.0.3", "env-paths": "^3.0.0", "jsdom": "^23.0.1", "onnxruntime-web": "*" diff --git a/provider/docs/src/corpus/doc/contentExtractor.test.ts b/provider/docs/src/corpus/doc/contentExtractor.test.ts index fb6a5ddc..eda2de80 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.test.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.test.ts @@ -2,9 +2,9 @@ import { describe, expect, test } from 'vitest' import { Content, extractContentUsingMozillaReadability } from './contentExtractor' describe('extractContentUsingMozillaReadability', () => { - test('extracts content', () => + test('extracts content', async () => expect( - extractContentUsingMozillaReadability.extractContent({ + await extractContentUsingMozillaReadability.extractContent({ id: 1, text: 'Bar - MySite

Bar

\n

Baz

', }) diff --git a/provider/docs/src/corpus/doc/contentExtractor.ts b/provider/docs/src/corpus/doc/contentExtractor.ts index eae82a80..7b09a52c 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.ts @@ -1,5 +1,4 @@ import { Readability } from '@mozilla/readability' -import { JSDOM } from 'jsdom' import { type Doc } from './doc' export interface Content { @@ -23,13 +22,33 @@ export interface Content { export interface ContentExtractor { id: string - extractContent(doc: Doc): Content | null + extractContent(doc: Doc): Promise } export const extractContentUsingMozillaReadability: ContentExtractor = { id: 'mozillaReadability', - extractContent(doc) { - const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document, { + async extractContent(doc) { + type ParseDOM = (html: string, url: string | undefined) => Promise + const parseDOM: ParseDOM = + typeof DOMParser === 'undefined' + ? async (html, url) => { + const { JSDOM } = await import('jsdom') + return new JSDOM(html, { url }).window.document + } + : (html, url) => { + const document = new DOMParser().parseFromString(html, 'text/html') + + // Set base URL. + if (url && document.head.querySelectorAll('base').length === 0) { + const baseEl = document.createElement('base') + baseEl.setAttribute('href', url) + document.head.append(baseEl) + } + + return Promise.resolve(document) + } + + const info = new Readability(await parseDOM(doc.text, doc.url), { charThreshold: 500, }).parse() return info diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts index 6c5fc0fd..dda55855 100644 --- a/provider/docs/src/corpus/index.ts +++ b/provider/docs/src/corpus/index.ts @@ -73,7 +73,5 @@ function cachedExtractContent( if (!extractor) { return Promise.resolve(null) } - return memo(cache, `${doc.url}:${doc.text}`, `extractContent:${extractor.id}`, () => - Promise.resolve(extractor.extractContent(doc)) - ) + return memo(cache, `${doc.url}:${doc.text}`, `extractContent:${extractor.id}`, () => extractor.extractContent(doc)) } diff --git a/provider/docs/src/provider/multiplex.ts b/provider/docs/src/provider/multiplex.ts index b5360c75..aa2144bb 100644 --- a/provider/docs/src/provider/multiplex.ts +++ b/provider/docs/src/provider/multiplex.ts @@ -4,10 +4,30 @@ import { OpenCodeGraphProvider } from '@opencodegraph/provider' * @template S The settings type. */ export function multiplex( - createProvider: (settings: S) => OpenCodeGraphProvider + createProvider: (settings: S) => Promise> ): OpenCodeGraphProvider { + const providerCache = new Map>>() + + function getProvider(settings: S): Promise> { + const key = JSON.stringify(settings) + let provider = providerCache.get(key) + if (!provider) { + provider = createProvider(settings) + providerCache.set(key, provider) + + // Prevent accidental memory leaks in case `settings` keeps changing. + // + // TODO(sqs): use an LRU cache or something + const MAX_SIZE = 10 + if (providerCache.size > MAX_SIZE) { + throw new Error(`provider cache is too big (max size ${MAX_SIZE})`) + } + } + return provider + } + return { - capabilities: (params, settings) => createProvider(settings).capabilities(params, settings), - annotations: (params, settings) => createProvider(settings).annotations(params, settings), + capabilities: (params, settings) => getProvider(settings).then(p => p.capabilities(params, settings)), + annotations: (params, settings) => getProvider(settings).then(p => p.annotations(params, settings)), } } diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index ecd29ecc..90b2cd38 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -5,12 +5,18 @@ import { type CapabilitiesParams, type CapabilitiesResult, } from '@opencodegraph/provider' -import { createCorpus } from '../corpus' +import { indexCorpus } from '../corpus' import { createWebStorageCorpusCache } from '../corpus/cache/localStorage' +import { corpusData } from '../corpus/data' +import { extractContentUsingMozillaReadability } from '../corpus/doc/contentExtractor' +import { createWebCorpusSource } from '../corpus/source/web/webCorpusSource' import { multiplex } from './multiplex' /** Settings for the docs OpenCodeGraph provider. */ -export interface Settings {} +export interface Settings { + entryPage: string + prefix: string +} const CORPUS_CACHE = typeof localStorage !== 'undefined' ? createWebStorageCorpusCache(localStorage, 'ocg-provider-docs') : undefined @@ -19,16 +25,19 @@ const CORPUS_CACHE = * An [OpenCodeGraph](https://opencodegraph.org) provider that adds contextual documentation to your * code from an existing documentation corpus. */ -export default multiplex(settings => { - const corpus = createCorpus( - [ - { id: 1, text: 'Signinpage is cool allowSignup authProviders' }, - { id: 2, text: 'Bazel build here is how to do it' }, - ], - { - cache: CORPUS_CACHE, - } +export default multiplex(async settings => { + const data = corpusData( + await createWebCorpusSource({ + entryPage: new URL(settings.entryPage), + prefix: new URL(settings.prefix), + logger: message => console.log(message), + }).documents() ) + const index = await indexCorpus(data, { + cache: CORPUS_CACHE, + contentExtractor: extractContentUsingMozillaReadability, + }) + return { capabilities(_params: CapabilitiesParams, settings: Settings): CapabilitiesResult { return {} @@ -36,7 +45,7 @@ export default multiplex(settings => { async annotations(params: AnnotationsParams, settings: Settings): Promise { console.time('search') - const searchResults = await corpus.search(params.content) + const searchResults = await index.search(params.content) console.timeEnd('search') const result: AnnotationsResult = { items: [], annotations: [] }