diff --git a/provider/docs/bin/docs-query.ts b/provider/docs/bin/docs-query.ts index 8c25e04e..3b10f938 100644 --- a/provider/docs/bin/docs-query.ts +++ b/provider/docs/bin/docs-query.ts @@ -33,7 +33,7 @@ const corpusData = JSON.parse(await readFile(corpusDataFile, 'utf8')) as CorpusD const cacheDir = envPaths('opencodegraph-provider-docs').cache const fsCache = createFileSystemCorpusCache(cacheDir) -const corpus = indexCorpus(corpusData, { +const corpus = await indexCorpus(corpusData, { cache: fsCache, contentExtractor: extractContentUsingMozillaReadability, }) diff --git a/provider/docs/src/corpus/doc/contentExtractor.test.ts b/provider/docs/src/corpus/doc/contentExtractor.test.ts index fea242e2..fb6a5ddc 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.test.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.test.ts @@ -4,7 +4,7 @@ import { Content, extractContentUsingMozillaReadability } from './contentExtract describe('extractContentUsingMozillaReadability', () => { test('extracts content', () => expect( - extractContentUsingMozillaReadability({ + extractContentUsingMozillaReadability.extractContent({ id: 1, text: 'Bar - MySite

Bar

\n

Baz

', }) diff --git a/provider/docs/src/corpus/doc/contentExtractor.ts b/provider/docs/src/corpus/doc/contentExtractor.ts index 330e0165..eae82a80 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.ts @@ -21,15 +21,23 @@ export interface Content { textContent: string } -export type ContentExtractor = (doc: Doc) => Content | null +export interface ContentExtractor { + id: string + extractContent(doc: Doc): Content | null +} -export const extractContentUsingMozillaReadability: ContentExtractor = doc => { - const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document, { charThreshold: 500 }).parse() - return info - ? { - title: info.title, - content: info.content, - textContent: info.textContent, - } - : null +export const extractContentUsingMozillaReadability: ContentExtractor = { + id: 'mozillaReadability', + extractContent(doc) { + const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document, { + charThreshold: 500, + }).parse() + return info + ? { + title: info.title, + content: info.content, + textContent: info.textContent, + } + : null + }, } diff --git a/provider/docs/src/corpus/index.test.ts b/provider/docs/src/corpus/index.test.ts index 24310c0d..f68bdb39 100644 --- a/provider/docs/src/corpus/index.test.ts +++ b/provider/docs/src/corpus/index.test.ts @@ -8,7 +8,7 @@ export function doc(id: DocID, text: string): Doc { } describe('indexCorpus', () => { - test('#docs', () => { - expect(indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])).docs.length).toBe(2) + test('#docs', async () => { + expect((await indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')]))).docs.length).toBe(2) }) }) diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts index b9679c80..6c5fc0fd 100644 --- a/provider/docs/src/corpus/index.ts +++ b/provider/docs/src/corpus/index.ts @@ -1,4 +1,4 @@ -import { noopCache, type CorpusCache } from './cache/cache' +import { memo, noopCache, type CorpusCache } from './cache/cache' import { type CorpusData } from './data' import { chunk, type Chunk, type ChunkIndex } from './doc/chunks' import { type Content, type ContentExtractor } from './doc/contentExtractor' @@ -21,7 +21,7 @@ export interface CorpusIndex { */ export interface IndexedDoc { doc: Doc - content?: Content + content: Content | null chunks: Chunk[] } @@ -47,13 +47,14 @@ export interface IndexOptions { /** * Index a corpus. */ -export function indexCorpus( +export async function indexCorpus( data: CorpusData, { cache = noopCache, contentExtractor }: IndexOptions = { cache: noopCache } -): CorpusIndex { +): Promise { const indexedDocs: IndexedDoc[] = [] + for (const doc of data.docs) { - const content = contentExtractor?.(doc) ?? undefined + const content = await cachedExtractContent(cache, contentExtractor, doc) const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') }) @@ -63,3 +64,16 @@ export function indexCorpus( const index: CorpusIndex = { data, docs: indexedDocs, search: query => multiSearch(index, query, cache) } return index } + +function cachedExtractContent( + cache: CorpusCache, + extractor: ContentExtractor | undefined, + doc: Doc +): Promise { + if (!extractor) { + return Promise.resolve(null) + } + return memo(cache, `${doc.url}:${doc.text}`, `extractContent:${extractor.id}`, () => + Promise.resolve(extractor.extractContent(doc)) + ) +} diff --git a/provider/docs/src/corpus/search/embeddings.test.ts b/provider/docs/src/corpus/search/embeddings.test.ts index f97c6241..762738f1 100644 --- a/provider/docs/src/corpus/search/embeddings.test.ts +++ b/provider/docs/src/corpus/search/embeddings.test.ts @@ -6,7 +6,7 @@ import { embeddingsSearch, embedText, similarity } from './embeddings' describe('embeddingsSearch', () => { test('finds matches', async () => { - expect(await embeddingsSearch(indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])), 'b')).toEqual< + expect(await embeddingsSearch(await indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])), 'b')).toEqual< CorpusSearchResult[] >([{ doc: 2, chunk: 0, score: 1, excerpt: 'b' }]) }) diff --git a/provider/docs/src/corpus/search/keyword.test.ts b/provider/docs/src/corpus/search/keyword.test.ts index 6538d7f4..977f5a9e 100644 --- a/provider/docs/src/corpus/search/keyword.test.ts +++ b/provider/docs/src/corpus/search/keyword.test.ts @@ -6,8 +6,8 @@ import { keywordSearch } from './keyword' import { calculateTFIDF } from './tfidf' describe('keywordSearch', () => { - test('finds matches', () => { - expect(keywordSearch(indexCorpus(corpusData([doc(1, 'aaa'), doc(2, 'bbb')])), 'bbb')).toEqual< + test('finds matches', async () => { + expect(keywordSearch(await indexCorpus(corpusData([doc(1, 'aaa'), doc(2, 'bbb')])), 'bbb')).toEqual< CorpusSearchResult[] >([ { diff --git a/provider/docs/src/corpus/search/tfidf.test.ts b/provider/docs/src/corpus/search/tfidf.test.ts index 3493696e..3e461e2b 100644 --- a/provider/docs/src/corpus/search/tfidf.test.ts +++ b/provider/docs/src/corpus/search/tfidf.test.ts @@ -3,14 +3,14 @@ import { indexCorpus } from '..' import { corpusData } from '../data' import { calculateTFIDF, createIndexForTFIDF } from './tfidf' -describe('createIndexForTFIDF', () => { +describe('createIndexForTFIDF', async () => { const data = corpusData([ { id: 1, text: 'a b c c c' }, { id: 2, text: 'b c d' }, { id: 3, text: 'c d e' }, ]) const docIDs = data.docs.map(({ id }) => id) - const index = indexCorpus(data) + const index = await indexCorpus(data) const tfidf = createIndexForTFIDF(index.docs) test('term in 1 doc', () => { diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts index b710b20a..1494cd6b 100644 --- a/provider/docs/src/e2e.test.ts +++ b/provider/docs/src/e2e.test.ts @@ -9,7 +9,7 @@ describe('e2e', () => { const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8') const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8') - const corpus = indexCorpus(corpusData([{ id: 1, text: docFile }])) + const corpus = await indexCorpus(corpusData([{ id: 1, text: docFile }])) const results = await corpus.search(codeFile) roundScores(results) expect(results).toEqual([