diff --git a/provider/docs/bin/docs-query.ts b/provider/docs/bin/docs-query.ts
index 8c25e04e..3b10f938 100644
--- a/provider/docs/bin/docs-query.ts
+++ b/provider/docs/bin/docs-query.ts
@@ -33,7 +33,7 @@ const corpusData = JSON.parse(await readFile(corpusDataFile, 'utf8')) as CorpusD
const cacheDir = envPaths('opencodegraph-provider-docs').cache
const fsCache = createFileSystemCorpusCache(cacheDir)
-const corpus = indexCorpus(corpusData, {
+const corpus = await indexCorpus(corpusData, {
cache: fsCache,
contentExtractor: extractContentUsingMozillaReadability,
})
diff --git a/provider/docs/src/corpus/doc/contentExtractor.test.ts b/provider/docs/src/corpus/doc/contentExtractor.test.ts
index fea242e2..fb6a5ddc 100644
--- a/provider/docs/src/corpus/doc/contentExtractor.test.ts
+++ b/provider/docs/src/corpus/doc/contentExtractor.test.ts
@@ -4,7 +4,7 @@ import { Content, extractContentUsingMozillaReadability } from './contentExtract
describe('extractContentUsingMozillaReadability', () => {
test('extracts content', () =>
expect(
- extractContentUsingMozillaReadability({
+ extractContentUsingMozillaReadability.extractContent({
id: 1,
text: '
Bar - MySiteBar
\nBaz
',
})
diff --git a/provider/docs/src/corpus/doc/contentExtractor.ts b/provider/docs/src/corpus/doc/contentExtractor.ts
index 330e0165..eae82a80 100644
--- a/provider/docs/src/corpus/doc/contentExtractor.ts
+++ b/provider/docs/src/corpus/doc/contentExtractor.ts
@@ -21,15 +21,23 @@ export interface Content {
textContent: string
}
-export type ContentExtractor = (doc: Doc) => Content | null
+export interface ContentExtractor {
+ id: string
+ extractContent(doc: Doc): Content | null
+}
-export const extractContentUsingMozillaReadability: ContentExtractor = doc => {
- const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document, { charThreshold: 500 }).parse()
- return info
- ? {
- title: info.title,
- content: info.content,
- textContent: info.textContent,
- }
- : null
+export const extractContentUsingMozillaReadability: ContentExtractor = {
+ id: 'mozillaReadability',
+ extractContent(doc) {
+ const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document, {
+ charThreshold: 500,
+ }).parse()
+ return info
+ ? {
+ title: info.title,
+ content: info.content,
+ textContent: info.textContent,
+ }
+ : null
+ },
}
diff --git a/provider/docs/src/corpus/index.test.ts b/provider/docs/src/corpus/index.test.ts
index 24310c0d..f68bdb39 100644
--- a/provider/docs/src/corpus/index.test.ts
+++ b/provider/docs/src/corpus/index.test.ts
@@ -8,7 +8,7 @@ export function doc(id: DocID, text: string): Doc {
}
describe('indexCorpus', () => {
- test('#docs', () => {
- expect(indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])).docs.length).toBe(2)
+ test('#docs', async () => {
+ expect((await indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')]))).docs.length).toBe(2)
})
})
diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts
index b9679c80..6c5fc0fd 100644
--- a/provider/docs/src/corpus/index.ts
+++ b/provider/docs/src/corpus/index.ts
@@ -1,4 +1,4 @@
-import { noopCache, type CorpusCache } from './cache/cache'
+import { memo, noopCache, type CorpusCache } from './cache/cache'
import { type CorpusData } from './data'
import { chunk, type Chunk, type ChunkIndex } from './doc/chunks'
import { type Content, type ContentExtractor } from './doc/contentExtractor'
@@ -21,7 +21,7 @@ export interface CorpusIndex {
*/
export interface IndexedDoc {
doc: Doc
- content?: Content
+ content: Content | null
chunks: Chunk[]
}
@@ -47,13 +47,14 @@ export interface IndexOptions {
/**
* Index a corpus.
*/
-export function indexCorpus(
+export async function indexCorpus(
data: CorpusData,
{ cache = noopCache, contentExtractor }: IndexOptions = { cache: noopCache }
-): CorpusIndex {
+): Promise<CorpusIndex> {
const indexedDocs: IndexedDoc[] = []
+
for (const doc of data.docs) {
- const content = contentExtractor?.(doc) ?? undefined
+ const content = await cachedExtractContent(cache, contentExtractor, doc)
const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') })
@@ -63,3 +64,16 @@ export function indexCorpus(
const index: CorpusIndex = { data, docs: indexedDocs, search: query => multiSearch(index, query, cache) }
return index
}
+
+function cachedExtractContent(
+ cache: CorpusCache,
+ extractor: ContentExtractor | undefined,
+ doc: Doc
+): Promise<Content | null> {
+ if (!extractor) {
+ return Promise.resolve(null)
+ }
+ return memo(cache, `${doc.url}:${doc.text}`, `extractContent:${extractor.id}`, () =>
+ Promise.resolve(extractor.extractContent(doc))
+ )
+}
diff --git a/provider/docs/src/corpus/search/embeddings.test.ts b/provider/docs/src/corpus/search/embeddings.test.ts
index f97c6241..762738f1 100644
--- a/provider/docs/src/corpus/search/embeddings.test.ts
+++ b/provider/docs/src/corpus/search/embeddings.test.ts
@@ -6,7 +6,7 @@ import { embeddingsSearch, embedText, similarity } from './embeddings'
describe('embeddingsSearch', () => {
test('finds matches', async () => {
- expect(await embeddingsSearch(indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])), 'b')).toEqual<
+ expect(await embeddingsSearch(await indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])), 'b')).toEqual<
CorpusSearchResult[]
>([{ doc: 2, chunk: 0, score: 1, excerpt: 'b' }])
})
diff --git a/provider/docs/src/corpus/search/keyword.test.ts b/provider/docs/src/corpus/search/keyword.test.ts
index 6538d7f4..977f5a9e 100644
--- a/provider/docs/src/corpus/search/keyword.test.ts
+++ b/provider/docs/src/corpus/search/keyword.test.ts
@@ -6,8 +6,8 @@ import { keywordSearch } from './keyword'
import { calculateTFIDF } from './tfidf'
describe('keywordSearch', () => {
- test('finds matches', () => {
- expect(keywordSearch(indexCorpus(corpusData([doc(1, 'aaa'), doc(2, 'bbb')])), 'bbb')).toEqual<
+ test('finds matches', async () => {
+ expect(keywordSearch(await indexCorpus(corpusData([doc(1, 'aaa'), doc(2, 'bbb')])), 'bbb')).toEqual<
CorpusSearchResult[]
>([
{
diff --git a/provider/docs/src/corpus/search/tfidf.test.ts b/provider/docs/src/corpus/search/tfidf.test.ts
index 3493696e..3e461e2b 100644
--- a/provider/docs/src/corpus/search/tfidf.test.ts
+++ b/provider/docs/src/corpus/search/tfidf.test.ts
@@ -3,14 +3,14 @@ import { indexCorpus } from '..'
import { corpusData } from '../data'
import { calculateTFIDF, createIndexForTFIDF } from './tfidf'
-describe('createIndexForTFIDF', () => {
+describe('createIndexForTFIDF', async () => {
const data = corpusData([
{ id: 1, text: 'a b c c c' },
{ id: 2, text: 'b c d' },
{ id: 3, text: 'c d e' },
])
const docIDs = data.docs.map(({ id }) => id)
- const index = indexCorpus(data)
+ const index = await indexCorpus(data)
const tfidf = createIndexForTFIDF(index.docs)
test('term in 1 doc', () => {
diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts
index b710b20a..1494cd6b 100644
--- a/provider/docs/src/e2e.test.ts
+++ b/provider/docs/src/e2e.test.ts
@@ -9,7 +9,7 @@ describe('e2e', () => {
const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8')
const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8')
- const corpus = indexCorpus(corpusData([{ id: 1, text: docFile }]))
+ const corpus = await indexCorpus(corpusData([{ id: 1, text: docFile }]))
const results = await corpus.search(codeFile)
roundScores(results)
expect(results).toEqual([