diff --git a/client/web-playground/src/demo/settings.ts b/client/web-playground/src/demo/settings.ts index 5326111f..5a2f3fbd 100644 --- a/client/web-playground/src/demo/settings.ts +++ b/client/web-playground/src/demo/settings.ts @@ -1,11 +1,22 @@ import { type ProviderSettings } from '@opencodegraph/client' +const USE_STORED_CORPUS = true + async function getProviders(): Promise> { const providerSettings: Record = { '../../../../provider/hello-world/index.ts': false, '../../../../provider/docs/src/provider/provider.ts': { - entryPage: 'http://localhost:5800/docs/start', - prefix: 'http://localhost:5800/docs', + corpus: USE_STORED_CORPUS + ? { + url: new URL( + 'tmp-ocg-provider-docs/sourcegraph-docs-old-web-corpus.json', + import.meta.url + ).toString(), + } + : { + entryPage: 'http://localhost:5800/docs/start', + prefix: 'http://localhost:5800/docs', + }, } satisfies import('@opencodegraph/provider-docs').Settings, '../../../../provider/links/index.ts': { links: [ diff --git a/client/web-playground/vite.config.ts b/client/web-playground/vite.config.ts index 3c66f7b1..28430b52 100644 --- a/client/web-playground/vite.config.ts +++ b/client/web-playground/vite.config.ts @@ -1,12 +1,15 @@ import { resolve } from 'path' import react from '@vitejs/plugin-react' -import { defineConfig } from 'vite' +import { defineConfig, searchForWorkspaceRoot } from 'vite' + +// TODO(sqs): un-hardcode +const docsProviderDataDir = resolve('/Users/sqs/tmp/ocg-provider-docs') export default defineConfig(({ mode }) => ({ plugins: [react()], resolve: { - alias: - mode === 'development' + alias: [ + ...(mode === 'development' ? [ // In dev mode, build from TypeScript sources so we don't need to run `tsc -b` // in the background. 
@@ -17,7 +20,12 @@ export default defineConfig(({ mode }) => ({ replacement: '$1/src/index', }, ] - : [], + : []), + { + find: 'tmp-ocg-provider-docs', + replacement: docsProviderDataDir, + }, + ], }, define: {}, css: { @@ -28,6 +36,9 @@ export default defineConfig(({ mode }) => ({ }, server: { port: 5900, + fs: { + allow: [searchForWorkspaceRoot(process.cwd()), docsProviderDataDir], + }, }, build: { emptyOutDir: false, diff --git a/provider/docs/README.md b/provider/docs/README.md index 66cc349d..111fe70a 100644 --- a/provider/docs/README.md +++ b/provider/docs/README.md @@ -1,4 +1,4 @@ -# Docs context provider for OpenCodeGraph +# Docs context provider for OpenCodeGraph This is a context provider for [OpenCodeGraph](https://opencodegraph.org) that adds contextual documentation to your code from an existing documentation corpus. @@ -56,6 +56,8 @@ time p run -s docs-query 'making provider work in vscode' $(find ../../web/conte TODOs: +- use indexeddb for more storage https://stackoverflow.com/questions/5663166/is-there-a-way-to-increase-the-size-of-localstorage-in-google-chrome-to-avoid-qu +- use worker for onnx https://huggingface.co/docs/transformers.js/tutorials/react#step-4-connecting-everything-together - simplify cache interface - deal with different content types (markdown/html) differently - make it slurp up gdocs/confluence/markdown in repos diff --git a/provider/docs/src/corpus/cache/localStorage.ts b/provider/docs/src/corpus/cache/localStorage.ts index e12d763f..67560a10 100644 --- a/provider/docs/src/corpus/cache/localStorage.ts +++ b/provider/docs/src/corpus/cache/localStorage.ts @@ -22,7 +22,12 @@ export function createWebStorageCorpusCache(storage: Storage, keyPrefix: string) } }, async set(contentID, key, value) { - storage.setItem(storageKey(contentID, key), JSON.stringify(value)) + const valueData = JSON.stringify(value) + try { + storage.setItem(storageKey(contentID, key), valueData) + } catch (error) { + // console.error(`failed to store data 
for ${contentID}:${key}`, error) + } }, } } diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts index 411797ee..0d854e73 100644 --- a/provider/docs/src/corpus/index.ts +++ b/provider/docs/src/corpus/index.ts @@ -55,7 +55,8 @@ export async function indexCorpus( const indexedDocs: IndexedDoc[] = [] for (const doc of data.docs) { - const content = await cachedExtractContent(cache, contentExtractor, doc) + const USE_NOOP_CACHE = typeof window !== 'undefined' // TODO(sqs): dont cache in browser + const content = await cachedExtractContent(USE_NOOP_CACHE ? noopCache : cache, contentExtractor, doc) const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') }) diff --git a/provider/docs/src/corpus/search/embeddings.ts b/provider/docs/src/corpus/search/embeddings.ts index 82bbdf55..733934ed 100644 --- a/provider/docs/src/corpus/search/embeddings.ts +++ b/provider/docs/src/corpus/search/embeddings.ts @@ -49,13 +49,21 @@ function cachedEmbedText(text: string, cache: CorpusCache): Promise { - const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {}) - const out = await pipe(text, { pooling: 'mean', normalize: true }) - return out.data + try { + console.time('embed') + const out = await pipe(text, { pooling: 'mean', normalize: true }) + console.timeEnd('embed') + return out.data + } catch (error) { + console.log(error) + throw error + } } /** diff --git a/provider/docs/src/corpus/source/source.ts b/provider/docs/src/corpus/source/source.ts index 43dd91c0..8c08895d 100644 --- a/provider/docs/src/corpus/source/source.ts +++ b/provider/docs/src/corpus/source/source.ts @@ -17,6 +17,9 @@ export function corpusDataURLSource(url: string): CorpusSource { if (!resp.ok) { throw new Error(`failed to fetch corpus data from ${url}: ${resp.status} ${resp.statusText}`) } + if (!resp.headers.get('Content-Type')?.includes('json')) { + throw new Error(`corpus data from ${url} is not JSON`) + } return resp.json() 
}) ) diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index 22978a16..811e65ab 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -58,11 +58,15 @@ export default multiplex(async settings => { const result: AnnotationsResult = { items: [], annotations: [] } for (const [i, sr] of searchResults.entries()) { + const MAX_RESULTS = 4 + if (i >= MAX_RESULTS) break + const doc = index.doc(sr.doc) const item: OpenCodeGraphItem = { id: i.toString(), - title: truncate(doc.content?.title ?? doc.doc.url ?? 'Untitled', 50), - detail: sr.excerpt, + title: truncate(doc.content?.title || doc.doc.url || 'Untitled', 50), + detail: truncate(doc.content?.textContent || sr.excerpt, 100), + url: doc.doc.url, } result.items.push(item) result.annotations.push({